def __init__(
            self,
            env_spec,
            hidden_sizes=(),
            hidden_nonlinearity=NL.tanh,
            num_seq_inputs=1,
            neat_output_dim=20,
            neat_network=None,
            prob_network=None,
    ):
        """
        Set up a categorical policy made of a feature ("NEAT") network and a
        softmax probability network sized to the feature network's output.

        :param env_spec: A spec for the mdp; its action space must be Discrete.
        :param hidden_sizes: hidden layer sizes of the default probability network
        :param hidden_nonlinearity: nonlinearity of the hidden layers in both default networks
        :param num_seq_inputs: factor multiplying the flat observation dimension to obtain
        the input width of the default feature network
        :param neat_output_dim: output width of the default feature network
        :param neat_network: manually specified feature network; when given, the default one
        (two hidden layers of 12 units, identity output) is not built
        :param prob_network: manually specified probability network; when given,
        hidden_sizes is not used
        :return:
        """
        # Must run before any extra local is created so locals() holds the arguments.
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)
        n_actions = env_spec.action_space.n

        # Fall back to a freshly initialised feature network.
        if neat_network is None:
            feature_input_dim = env_spec.observation_space.flat_dim * num_seq_inputs
            neat_network = MLP(
                input_shape=(feature_input_dim,),
                output_dim=neat_output_dim,
                hidden_sizes=(12, 12),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.identity,
            )
        feature_in = neat_network.input_layer
        feature_out = neat_network.output_layer

        # The default probability network consumes vectors as wide as the
        # feature network's output and emits one probability per action.
        if prob_network is None:
            feature_width = L.get_output_shape(feature_out)[1]
            prob_network = MLP(
                input_shape=(feature_width,),
                output_dim=n_actions,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
        prob_in = prob_network.input_layer
        prob_out = prob_network.output_layer

        # Feature network handles and its compiled forward pass.
        self._phi = feature_out
        self._obs = feature_in
        self._neat_output = ext.compile_function(
            [feature_in.input_var],
            L.get_output(feature_out),
        )

        # Probability network handles and its compiled forward pass.
        self.prob_network = prob_network
        self._l_prob = prob_out
        self._l_obs = prob_in
        self._f_prob = ext.compile_function(
            [prob_in.input_var],
            L.get_output(prob_out),
        )

        self._dist = Categorical(n_actions)

        super(PowerGradientPolicy, self).__init__(env_spec)
        # Only the probability network's output layer is registered here.
        LasagnePowered.__init__(self, [prob_out])
Exemplo n.º 2
0
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=NL.tanh,
                 output_b_init=None,
                 weight_signal=1.0,
                 weight_nonsignal=1.0,
                 weight_smc=1.0):
        """
        Build a softmax MLP whose output bias comes from compute_output_b_init
        and hand it to the parent policy as its probability network.

        :param env_spec: A spec for the mdp; its action space must be Discrete.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_b_init: forwarded to compute_output_b_init
        :param weight_signal: forwarded to compute_output_b_init
        :param weight_nonsignal: forwarded to compute_output_b_init
        :param weight_smc: forwarded to compute_output_b_init
        :return:
        """
        # Record the constructor arguments as passed in, before deriving anything.
        Serializable.quick_init(self, locals())

        action_space = env_spec.action_space
        assert isinstance(action_space, Discrete)

        bias_init = compute_output_b_init(
            action_space.names,
            output_b_init,
            weight_signal,
            weight_nonsignal,
            weight_smc,
        )

        network_kwargs = dict(
            input_shape=(env_spec.observation_space.flat_dim, ),
            output_dim=action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
            output_b_init=bias_init,
        )
        prob_network = MLP(**network_kwargs)

        # Arguments stay positional, exactly as the parent is called originally.
        super(InitCategoricalMLPPolicy, self).__init__(
            env_spec, hidden_sizes, hidden_nonlinearity, prob_network)
Exemplo n.º 3
0
def create_policy_rllab(policy, env, weights):
    """
    Build a Gaussian MLP policy for ``env`` and load the module-level
    ``WEIGHTS`` into it.

    :param policy: architecture name: 'linear' (no hidden layer) or
        'simple-nn' (one hidden layer of 16 units)
    :param env: environment providing ``observation_space.flat_dim``,
        ``action_space.flat_dim`` and ``spec``
    :param weights: must be None; loading custom weights is not implemented,
        so the module-level ``WEIGHTS`` are always used
    :return: the GaussianMLPPolicy with its parameters set to ``WEIGHTS``
    :raises NotImplementedError: for an unknown ``policy`` name, or when
        ``weights`` is not None
    """
    # Validate the cheap arguments first so that a bad call fails before any
    # network is constructed. NotImplementedError is a subclass of Exception,
    # so callers catching Exception keep working.
    if policy == 'linear':
        hidden_sizes = ()
    elif policy == 'simple-nn':
        hidden_sizes = (16, )
    else:
        raise NotImplementedError('NOT IMPLEMENTED.')
    if weights is not None:
        raise NotImplementedError('TODO load pickle file.')

    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim

    # Mean network: tanh hidden layers, linear output, normally initialised
    # output weights (which are overwritten by WEIGHTS below).
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=LI.Normal(),
    )
    # A separate local is used so the ``policy`` name argument is not rebound.
    gaussian_policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network)

    gaussian_policy.set_param_values(WEIGHTS)
    return gaussian_policy
def train(env, policy, policy_init, num_episodes, episode_cap, horizon,
          **alg_args):
    """
    Train a Gaussian MLP policy with TRPO on a normalized rllab environment.

    :param env: environment name, resolved through ``rllab_env_from_name``
    :param policy: architecture name: 'linear' (no hidden layer) or
        'simple-nn' (one hidden layer of 16 units)
    :param policy_init: output-weight initialisation: 'zeros' or 'normal'
    :param num_episodes: episodes per batch; the batch size is
        ``horizon * num_episodes``
    :param episode_cap: when truthy, ``max_episodes=num_episodes`` is added to
        the algorithm arguments
    :param horizon: maximum path length
    :param alg_args: extra keyword arguments forwarded to TRPO
    :raises ValueError: for an unrecognized ``policy_init``
    :raises NotImplementedError: for an unknown ``policy`` name
    """
    # Validate the string options before building the environment, so a typo
    # fails immediately. Both exception types are subclasses of Exception, so
    # callers catching Exception keep working.
    if policy_init not in ('zeros', 'normal'):
        raise ValueError('Unrecognized policy initialization.')
    if policy == 'linear':
        hidden_sizes = ()
    elif policy == 'simple-nn':
        hidden_sizes = (16, )
    else:
        raise NotImplementedError('NOT IMPLEMENTED.')

    # Getting the environment
    env_class = rllab_env_from_name(env)
    environment = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    else:
        initializer = LI.Normal()

    # Creating the policy: tanh hidden layers and a linear output layer.
    obs_dim = environment.observation_space.flat_dim
    action_dim = environment.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    gaussian_policy = GaussianMLPPolicy(
        env_spec=environment.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=environment.spec)

    # Optionally cap the number of episodes.
    # NOTE(review): the original comment said "If -1, this is unbounded";
    # that handling is not visible here -- confirm against the algorithm.
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=environment,
                policy=gaussian_policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()
Exemplo n.º 5
0
    def __init__(self, wrapped_constraint,
                 env_spec,
                 yield_zeros_until=1,
                 optimizer=None,
                 hidden_sizes=(32,),
                 hidden_nonlinearity=NL.sigmoid,
                 lag_time=10,
                 coeff=1.,
                 filter_bonuses=False,
                 max_epochs=25,
                 *args, **kwargs):
        """
        Wrap a constraint and attach an MLP predictor that maps flat
        observations to a single sigmoid output, trained by mean squared error.

        :param wrapped_constraint: constraint object being wrapped; its
            ``has_baseline`` (and ``baseline``, when present) are mirrored here
        :param env_spec: A spec for the mdp; supplies the observation dimension
        :param yield_zeros_until: stored as ``_yield_zeros_until``
        :param optimizer: optimizer for the predictor; defaults to a
            FirstOrderOptimizer with ``max_epochs`` epochs and no batching
        :param hidden_sizes: hidden layer sizes of the predictor network
        :param hidden_nonlinearity: nonlinearity of the predictor's hidden layers
        :param lag_time: stored as ``_lag_time``
        :param coeff: stored as ``_coeff``
        :param filter_bonuses: stored as ``_filter_bonuses``
        :param max_epochs: stored, and used for the default optimizer
        :param args: accepted but not used in this constructor
        :param kwargs: accepted but not used in this constructor
        """
        Serializable.quick_init(self, locals())

        # Plain configuration.
        self._wrapped_constraint = wrapped_constraint
        self._env_spec = env_spec
        self._yield_zeros_until = yield_zeros_until
        self._hidden_sizes = hidden_sizes
        self._lag_time = lag_time
        self._coeff = coeff
        self._filter_bonuses = filter_bonuses
        self._max_epochs = max_epochs
        self.use_bonus = True

        if optimizer is None:
            optimizer = FirstOrderOptimizer(max_epochs=max_epochs,
                                            batch_size=None)
        self._optimizer = optimizer

        # Predictor: flat observation in, one sigmoid unit out.
        predictor_network = MLP(
            1, hidden_sizes, hidden_nonlinearity, NL.sigmoid,
            input_shape=(env_spec.observation_space.flat_dim,),
        )
        net_in = predictor_network.input_layer
        net_out = predictor_network.output_layer

        # Parameters must be registered before the optimizer is pointed at self.
        LasagnePowered.__init__(self, [net_out])

        x_var = net_in.input_var
        y_var = TT.matrix("ys")
        out_var = L.get_output(net_out, {net_in: x_var})

        # Mean squared error between targets and predictions.
        regression_loss = TT.mean(TT.square(y_var - out_var))

        self._optimizer.update_opt(
            loss=regression_loss,
            target=self,
            inputs=[x_var, y_var],
        )
        self._f_predict = compile_function([x_var], out_var)

        self._fit_steps = 0

        # Mirror the wrapped constraint's baseline, if it has one.
        has_baseline = wrapped_constraint.has_baseline
        self.has_baseline = has_baseline
        if has_baseline:
            self.baseline = wrapped_constraint.baseline
    def __init__(
        self,
        env_spec,
        latent_dim=0,  # placeholder argument, only stored
        latent_name='categorical',  # placeholder argument, only stored
        bilinear_integration=False,  # placeholder argument, only stored
        resample=False,  # placeholder argument, only stored
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
    ):
        """
        Categorical MLP policy that also accepts (and merely stores) the
        latent-related arguments.

        :param env_spec: A spec for the mdp; its action space must be Discrete.
        :param latent_dim: stored on the instance, not used to build the network
        :param latent_name: stored on the instance, not used to build the network
        :param bilinear_integration: stored on the instance, not used to build the network
        :param resample: stored on the instance, not used to build the network
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        # Latent-related attributes are kept on the instance only; nothing in
        # this constructor uses them to build the network.
        self.latent_dim = latent_dim
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self._set_std_to_0 = False

        assert isinstance(env_spec.action_space, Discrete)
        n_actions = env_spec.action_space.n

        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=n_actions,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
        net_in = prob_network.input_layer
        net_out = prob_network.output_layer

        self._l_prob = net_out
        self._l_obs = net_in
        # Compiled forward pass: observations -> action probabilities.
        self._f_prob = ext.compile_function([net_in.input_var],
                                            L.get_output(net_out))

        self._dist = Categorical(n_actions)
        # Rui: added layers for function get_params()
        self._layers = prob_network.layers

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [net_out])
Exemplo n.º 7
0
    def _buildVFNetFromBaseline(self, blRegressor):
        """
        Clone the mean network of a baseline regressor into a new MLP and
        expose it as a Theano value function together with a Jacobian.

        :param blRegressor: regressor providing ``get_param_shapes()`` and a
            ``_mean_network`` with ``layers`` and ``output_layer``
        :return: dict with keys 'X' (input variable of the new net), 'net'
            (the new MLP), 'valueFunc' (compiled function X -> prediction),
            'pred' (symbolic prediction) and 'vfJacob' (symbolic Jacobian of
            ``pred[0]`` with respect to X)
        """
        import lasagne
        import lasagne.layers as L
        import theano as T
        from rllab.core.network import MLP

        source_net = blRegressor._mean_network
        param_shapes = blRegressor.get_param_shapes()

        # Hidden sizes: first dimension of every second parameter shape,
        # starting at index 1 and excluding the last two entries.
        # Presumably these are the hidden-layer bias shapes -- TODO confirm.
        hidden_arch = tuple(
            param_shapes[idx][0]
            for idx in range(1, len(param_shapes) - 2, 2)
        )

        # Reuse the source network's nonlinearities so the clone matches it.
        hidden_nl = source_net.layers[1].nonlinearity
        output_nl = source_net.output_layer.nonlinearity

        # Parameter values to copy over.
        source_params = L.get_all_param_values(source_net.output_layer)

        net = MLP(
            input_shape=(param_shapes[0][0], ),
            output_dim=1,
            hidden_sizes=hidden_arch,
            hidden_nonlinearity=hidden_nl,
            output_nonlinearity=output_nl,
        )
        L.set_all_param_values(net.output_layer, source_params)

        X = net.input_layer.input_var
        pred = L.get_output(net.output_layer, deterministic=True)
        # Compiled value function mapping inputs to predictions.
        valueFunc = T.function([X], pred)
        # Symbolic Jacobian of the first prediction row with respect to X.
        vfJacob = T.gradient.jacobian(pred[0], X)

        return {
            'X': X,
            'net': net,
            'valueFunc': valueFunc,
            'pred': pred,
            'vfJacob': vfJacob,
        }
    def __init__(
            self,
            name,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.tanh,
            num_seq_inputs=1,
    ):
        """
        Build a Q-network MLP with one linear output per discrete action and
        compile its forward pass.

        :param name: name passed through to the MLP
        :param env_spec: A spec for the mdp; its action space must be Discrete.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param num_seq_inputs: factor multiplying the flat observation dimension to obtain
        the network input width
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)
        self._env_spec = env_spec

        n_actions = env_spec.action_space.n
        input_width = env_spec.observation_space.flat_dim * num_seq_inputs

        q_network = MLP(
            input_shape=(input_width,),
            output_dim=n_actions,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.linear,
            name=name
        )
        q_in = q_network.input_layer
        q_out = q_network.output_layer

        self._l_q = q_out
        self._l_obs = q_in
        # Compiled forward pass: observations -> network outputs.
        self._f_q = ext.compile_function([q_in.input_var],
                                         L.get_output(q_out))

        self._dist = Categorical(n_actions)

        super(CategoricalMlpQPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [q_out])
Exemplo n.º 9
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
    ):
        """
        Categorical policy backed by a softmax MLP over discrete actions.

        :param env_spec: A spec for the mdp; its action space must be Discrete.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy; when given,
        hidden_sizes and hidden_nonlinearity are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        action_space = env_spec.action_space
        assert isinstance(action_space, Discrete)

        # Build the default network only when the caller did not supply one.
        if prob_network is None:
            default_net_kwargs = dict(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            prob_network = MLP(**default_net_kwargs)

        output_layer = prob_network.output_layer
        input_layer = prob_network.input_layer

        self._l_prob = output_layer
        self._l_obs = input_layer
        # Compiled forward pass: observations -> action probabilities.
        self._f_prob = ext.compile_function(
            [input_layer.input_var],
            L.get_output(output_layer),
        )

        self._dist = Categorical(action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [output_layer])
Exemplo n.º 10
0
    def __init__(
            self,
            disc_window,
            disc_joints_dim,
            iteration,
            a_max=0.7,
            a_min=0.0,
            batch_size = 64,
            iter_per_train = 10,
            decent_portion=0.8,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=NL.tanh,
            disc_network=None,
    ):
        """
        Build a discriminator MLP over windows of joint values, compile its
        forward and Adam training functions, and load its data.

        :param disc_window: window length; the network input width is
            ``disc_window * disc_joints_dim``
        :param disc_joints_dim: number of joint dimensions per window step
        :param iteration: total iteration count; ``end_iter`` is
            ``int(iteration * decent_portion)``
        :param a_max: last value of the ``a`` schedule
        :param a_min: first value of the ``a`` schedule
        :param batch_size: stored as ``self.batch_size``
        :param iter_per_train: stored as ``self.iter_per_train``
        :param decent_portion: fraction of ``iteration`` covered by the
            ``a`` schedule
        :param hidden_sizes: hidden layer sizes of the default network
        :param hidden_nonlinearity: nonlinearity of the default hidden layers
        :param output_nonlinearity: nonlinearity of the default output layer
        :param disc_network: manually specified network; when given, the
            network parameters above are ignored
        """
        # Fix: honour the constructor arguments. These attributes were
        # previously hard-coded to 64 and 10 (the defaults), silently
        # discarding any other value the caller passed.
        self.batch_size = batch_size
        self.iter_per_train = iter_per_train
        self.disc_window = disc_window
        self.disc_joints_dim = disc_joints_dim
        self.disc_dim = self.disc_window*self.disc_joints_dim
        self.end_iter = int(iteration*decent_portion)
        self.iter_count = 0
        out_dim = 1
        target_var = TT.ivector('targets')

        # create network
        if disc_network is None:
            disc_network = MLP(
                input_shape=(self.disc_dim,),
                output_dim=out_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )

        self._disc_network = disc_network

        disc_reward = disc_network.output_layer
        obs_var = disc_network.input_layer.input_var

        disc_var, = L.get_output([disc_reward])

        self._disc_var = disc_var

        LasagnePowered.__init__(self, [disc_reward])
        # Forward pass: observations -> discriminator output.
        self._f_disc = ext.compile_function(
            inputs=[obs_var],
            outputs=[disc_var],
            log_name="f_discriminate_forward",
        )

        # NOTE(review): categorical cross-entropy is applied to a network
        # whose default output is a single tanh unit -- confirm this loss is
        # what is intended.
        params = L.get_all_params(disc_network, trainable=True)
        loss = lasagne.objectives.categorical_crossentropy(disc_var, target_var).mean()
        updates = lasagne.updates.adam(loss, params, learning_rate=0.01)
        # Training step: returns the loss and applies the Adam updates.
        self._f_disc_train = ext.compile_function(
            inputs=[obs_var, target_var],
            outputs=[loss],
            updates=updates,
            log_name="f_discriminate_train"
        )

        self.data = self.load_data()
        # Linear schedule from a_min to a_max with end_iter entries.
        self.a = np.linspace(a_min, a_max, self.end_iter)
def main():
    """
    Command-line entry point: parse options, build the pursuit-evasion
    environment, policy and baseline, configure the logger and run TRPO.

    Differences from the previous revision (all bug fixes):

    * the recurrent + conv branch referenced the undefined name ``emv``;
    * a non-linear baseline referenced the undefined name ``obsfeat_space``;
    * the GRU hidden size used ``int(args.policy_hidden_sizes)``, which fails
      on comma-separated values such as the default ``'128,128'``; the first
      parsed size is used instead;
    * an unsupported ``--recurrent`` value now fails immediately instead of
      leaving ``policy`` unbound;
    * the recurrent conv network now uses lowercase ``'valid'`` padding, like
      the non-recurrent conv branch;
    * the logger state captured before training is restored afterwards.

    :raises NotImplementedError: for an unknown ``--map_type`` or an
        unsupported ``--recurrent`` value
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    parser = argparse.ArgumentParser()
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')

    # Algorithm options
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae_lambda', type=float, default=1.0)
    parser.add_argument('--reward_scale', type=float, default=1.0)

    parser.add_argument('--n_iter', type=int, default=250)
    parser.add_argument('--sampler_workers', type=int, default=1)
    parser.add_argument('--max_traj_len', type=int, default=250)
    parser.add_argument('--update_curriculum',
                        action='store_true',
                        default=False)
    parser.add_argument('--n_timesteps', type=int, default=8000)
    parser.add_argument('--control', type=str, default='centralized')

    # Environment options
    parser.add_argument('--rectangle', type=str, default='10,10')
    parser.add_argument('--map_type', type=str, default='rectangle')
    parser.add_argument('--n_evaders', type=int, default=5)
    parser.add_argument('--n_pursuers', type=int, default=2)
    parser.add_argument('--obs_range', type=int, default=3)
    parser.add_argument('--n_catch', type=int, default=2)
    parser.add_argument('--urgency', type=float, default=0.0)
    parser.add_argument('--pursuit', dest='train_pursuit', action='store_true')
    parser.add_argument('--evade', dest='train_pursuit', action='store_false')
    parser.set_defaults(train_pursuit=True)
    parser.add_argument('--surround', action='store_true', default=False)
    parser.add_argument('--constraint_window', type=float, default=1.0)
    parser.add_argument('--sample_maps', action='store_true', default=False)
    parser.add_argument('--map_file', type=str, default='../maps/map_pool.npy')
    parser.add_argument('--flatten', action='store_true', default=False)
    parser.add_argument('--reward_mech', type=str, default='global')
    parser.add_argument('--catchr', type=float, default=0.1)
    parser.add_argument('--term_pursuit', type=float, default=5.0)

    # Policy / baseline options
    parser.add_argument('--recurrent', type=str, default=None)
    parser.add_argument('--policy_hidden_sizes', type=str, default='128,128')
    # The misspelled flag name is kept as-is so existing command lines work.
    parser.add_argument('--baselin_hidden_sizes', type=str, default='128,128')
    parser.add_argument('--baseline_type', type=str, default='linear')

    parser.add_argument('--conv', action='store_true', default=False)

    parser.add_argument('--max_kl', type=float, default=0.01)

    # Logging options
    parser.add_argument('--log_dir', type=str, required=False)
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for stub objects')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help=
        'Whether to only print the tabular log information (in a horizontal format)'
    )

    args = parser.parse_args()

    parallel_sampler.initialize(n_parallel=args.sampler_workers)

    if args.seed is not None:
        set_seed(args.seed)
        parallel_sampler.set_seed(args.seed)

    args.hidden_sizes = tuple(map(int, args.policy_hidden_sizes.split(',')))

    # Either load a pool of maps from disk or generate a single map.
    if args.sample_maps:
        map_pool = np.load(args.map_file)
    else:
        if args.map_type == 'rectangle':
            env_map = TwoDMaps.rectangle_map(
                *map(int, args.rectangle.split(',')))
        elif args.map_type == 'complex':
            env_map = TwoDMaps.complex_map(
                *map(int, args.rectangle.split(',')))
        else:
            raise NotImplementedError()
        map_pool = [env_map]

    env = PursuitEvade(map_pool,
                       n_evaders=args.n_evaders,
                       n_pursuers=args.n_pursuers,
                       obs_range=args.obs_range,
                       n_catch=args.n_catch,
                       train_pursuit=args.train_pursuit,
                       urgency_reward=args.urgency,
                       surround=args.surround,
                       sample_maps=args.sample_maps,
                       constraint_window=args.constraint_window,
                       flatten=args.flatten,
                       reward_mech=args.reward_mech,
                       catchr=args.catchr,
                       term_pursuit=args.term_pursuit)

    env = RLLabEnv(StandardizedEnv(env,
                                   scale_reward=args.reward_scale,
                                   enable_obsnorm=False),
                   mode=args.control)

    if args.recurrent:
        # Only the GRU policy is wired up; reject anything else before
        # building a feature network (previously `policy` stayed unbound and
        # the script failed later with a NameError).
        if args.recurrent != 'gru':
            raise NotImplementedError(
                'Unsupported recurrent policy type: %r' % (args.recurrent, ))
        if args.conv:
            feature_network = ConvNetwork(
                # Fix: was `emv.spec...`, an undefined name.
                input_shape=env.spec.observation_space.shape,
                output_dim=5,
                conv_filters=(8, 16, 16),
                conv_filter_sizes=(3, 3, 3),
                conv_strides=(1, 1, 1),
                # Fix: lowercase, consistent with the non-recurrent conv
                # branch below.
                conv_pads=('valid', 'valid', 'valid'),
                hidden_sizes=(64, ),
                hidden_nonlinearity=NL.rectify,
                output_nonlinearity=NL.softmax)
        else:
            feature_network = MLP(
                input_shape=(env.spec.observation_space.flat_dim +
                             env.spec.action_space.flat_dim, ),
                output_dim=5,
                hidden_sizes=(128, 128, 128),
                hidden_nonlinearity=NL.tanh,
                output_nonlinearity=None)
        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            feature_network=feature_network,
            # Fix: int('128,128') raised ValueError with the default value;
            # use the first parsed size (identical for a single value).
            hidden_dim=args.hidden_sizes[0])
    elif args.conv:
        feature_network = ConvNetwork(
            input_shape=env.spec.observation_space.shape,
            output_dim=5,
            conv_filters=(8, 16, 16),
            conv_filter_sizes=(3, 3, 3),
            conv_strides=(1, 1, 1),
            conv_pads=('valid', 'valid', 'valid'),
            hidden_sizes=(64, ),
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax)
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      prob_network=feature_network)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)

    if args.baseline_type == 'linear':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
    else:
        # Fix: was `ZeroBaseline(obsfeat_space)`, an undefined name. Built
        # from the env spec like the linear baseline above.
        baseline = ZeroBaseline(env_spec=env.spec)

    # logger
    default_log_dir = config.LOG_DIR
    if args.log_dir is None:
        log_dir = osp.join(default_log_dir, args.exp_name)
    else:
        log_dir = args.log_dir
    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    logger.log_parameters_lite(params_log_file, args)
    logger.add_text_output(text_log_file)
    logger.add_tabular_output(tabular_log_file)
    # Remember the previous logger state so it can be restored after training.
    prev_snapshot_dir = logger.get_snapshot_dir()
    prev_mode = logger.get_snapshot_mode()
    logger.set_snapshot_dir(log_dir)
    logger.set_snapshot_mode(args.snapshot_mode)
    logger.set_log_tabular_only(args.log_tabular_only)
    logger.push_prefix("[%s] " % args.exp_name)

    try:
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.n_timesteps,
            max_path_length=args.max_traj_len,
            n_itr=args.n_iter,
            discount=args.discount,
            gae_lambda=args.gae_lambda,
            step_size=args.max_kl,
            mode=args.control,
        )

        algo.train()
    finally:
        # Undo the logger changes made above, even if training fails.
        logger.set_snapshot_mode(prev_mode)
        logger.set_snapshot_dir(prev_snapshot_dir)
        logger.remove_tabular_output(tabular_log_file)
        logger.remove_text_output(text_log_file)
        logger.pop_prefix()
Exemplo n.º 12
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        split_masks=None,
        dist_cls=DiagonalGaussian,
        mp_dim=0,
        mp_sel_hid_dim=0,
        mp_sel_num=0,
        mp_projection_dim=2,
        net_mode=0,  # 0: vanilla, 1: append mp to second layer, 2: project mp to lower space, 3: mp selection blending, 4: mp selection discrete
        split_init_net=None,
        split_units=None,
        wc_net_path=None,
        learn_segment=False,
        split_num=1,
        split_layer=[0],
        split_std=False,
        task_id=0,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            if net_mode == 1:
                mean_network = MLPAppend(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    append_dim=mp_dim,
                )
            elif net_mode == 2:
                mean_network = MLP_PROJ(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_hid_dim=16,
                    mp_proj_dim=mp_projection_dim,
                )
            elif net_mode == 3:
                mean_network = MLP_PS(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_sel_hid_dim=mp_sel_hid_dim,
                    mp_sel_num=mp_sel_num,
                )
            elif net_mode == 4:
                wc_net = joblib.load(wc_net_path)
                mean_network = MLP_PSD(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    mp_dim=mp_dim,
                    mp_sel_hid_dim=mp_sel_hid_dim,
                    mp_sel_num=mp_sel_num,
                    wc_net=wc_net,
                    learn_segment=learn_segment,
                )
            elif net_mode == 5:
                mean_network = MLP_Split(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_layer=split_layer,
                    split_num=split_num,
                )
            elif net_mode == 6:
                mean_network = MLP_SplitAct(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    split_units=split_units,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 7:
                mean_network = MLP_SoftSplit(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 8:
                mean_network = MLP_MaskedSplit(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    split_num=split_num,
                    split_masks=split_masks,
                    init_net=split_init_net._mean_network,
                )
            elif net_mode == 9:
                mean_network = MLP_MaskedSplitCont(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    task_id=task_id,
                    init_net=split_init_net._mean_network,
                )
            else:
                mean_network = MLP(
                    input_shape=(obs_dim, ),
                    output_dim=action_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                if net_mode != 8 or not split_std:
                    l_log_std = ParamLayer(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=lasagne.init.Constant(np.log(init_std)),
                        name="output_log_std",
                        trainable=learn_std,
                    )
                else:
                    l_log_std = ParamLayerSplit(
                        mean_network.input_layer,
                        num_units=action_dim,
                        param=lasagne.init.Constant(np.log(init_std)),
                        name="output_log_std",
                        trainable=learn_std,
                        split_num=split_num,
                        init_param=split_init_net.get_params()[-1])
                if net_mode == 6 or net_mode == 7 or (net_mode == 8
                                                      and not split_std):
                    l_log_std.get_params()[0].set_value(
                        split_init_net.get_params()[-1].get_value())
                if net_mode == 9:
                    l_log_std.get_params()[0].set_value(
                        split_init_net.get_params()[-1].get_value() + 0.5)

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std
        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        if net_mode == 3 or net_mode == 4:
            self._f_blendweight = ext.compile_function(
                inputs=[obs_var], outputs=[self._mean_network._blend_weights])
            entropy = -TT.mean(self._mean_network._blend_weights *
                               TT.log(self._mean_network._blend_weights))
            self._f_weightentropy = ext.compile_function(inputs=[obs_var],
                                                         outputs=[entropy])
            avg_weights = TT.mean(self._mean_network._blend_weights, axis=0)
            entropy2 = -TT.mean(avg_weights * TT.log(avg_weights))
            self._f_choiceentropy = ext.compile_function(inputs=[obs_var],
                                                         outputs=[entropy2])
Exemplo n.º 13
0
def run_task(vv, log_dir=None, exp_name=None):
    """
    Train a circle-following policy with TRPO or CPO for a single variant.

    The module-level ``policy`` and ``baseline`` globals are reused when both
    are already set (only the policy's log-std layer is rebuilt); otherwise
    fresh ones are created and stored back into those globals.

    :param vv: variant mapping; the keys read here are 'model_type',
        'robot_type', 'use_ros', 'target_velocity', 'radius', 'dt', 'eps'
        and 'algo'
    :param log_dir: not referenced in this function
    :param exp_name: not referenced in this function
    :raises ValueError: if 'model_type' or 'robot_type' is not recognized
    """
    global policy, baseline

    step_size = 0.01
    subsample_factor = 0.2

    # Reject unsupported variants before anything is constructed
    if vv['model_type'] not in ('BrushTireModel', 'LinearTireModel'):
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ('MRZR', 'RCCar'):
        raise ValueError('Unrecognized robot type')

    # Pick the environment class; the ROS variant is imported lazily so the
    # import only happens when it is actually requested
    if vv['use_ros']:
        from aa_simulation.envs.circle.circle_env_ros import CircleEnvROS
        env_cls = CircleEnvROS
    else:
        env_cls = CircleEnv
    env = env_cls(
        target_velocity=vv['target_velocity'],
        radius=vv['radius'],
        dt=vv['dt'],
        model_type=vv['model_type'],
        robot_type=vv['robot_type'],
    )

    # Persist the variant next to the snapshots for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Per-component exploration std (speed, steering). It is set by hand
    # because the action space is not rescaled during training.
    init_std = [vv['target_velocity'] / 4, np.pi / 6]

    if policy is not None and baseline is not None:
        # Pre-trained networks: swap in a fresh log-std layer so that
        # exploration is re-enabled, then rebuild everything derived from it
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True,
        )
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
    else:
        # Fresh networks. The output bias of the mean network starts at the
        # analytically computed action (rough estimate for RL to fine-tune).
        wheelbase = 0.257
        output_mean = np.array([
            vv['target_velocity'],
            np.arctan(wheelbase / vv['radius']),  # CCW
        ])
        hidden_sizes = (32, 32)

        # Keep the output W matrix small so the output bias dominates the
        # final value, which speeds up learning. The numbers are arbitrary.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        mean_network = MLP(
            input_shape=(env.spec.observation_space.flat_dim, ),
            output_dim=env.spec.action_space.flat_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=LN.tanh,
            output_nonlinearity=None,
            output_W_init=LI.GlorotUniform(gain=W_gain),
            output_b_init=output_mean,
        )
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hidden_sizes,
            init_std=init_std,
            mean_network=mean_network,
        )
        baseline = LinearFeatureBaseline(
            env_spec=env.spec,
            target_key='returns',
        )

    safety_baseline = LinearFeatureBaseline(
        env_spec=env.spec,
        target_key='safety_returns',
    )
    safety_constraint = CircleSafetyConstraint(
        max_value=1.0,
        eps=vv['eps'],
        baseline=safety_baseline,
    )

    # TRPO and CPO share most settings; CPO adds the safety-specific ones
    if vv['algo'] == 'TRPO':
        algo_cls = TRPO
        extra_args = {}
    else:
        algo_cls = CPO
        extra_args = dict(
            safety_constraint=safety_constraint,
            gae_lambda=0.95,
            safety_gae_lambda=1,
            optimizer_args={'subsample_factor': subsample_factor},
        )
    algo = algo_cls(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=600,
        max_path_length=env.horizon,
        n_itr=600,
        discount=0.99,
        step_size=step_size,
        plot=False,
        **extra_args
    )
    algo.train()
Exemplo n.º 14
0
    # NOTE(review): this is an excerpt; test_num, dim, difficulties, in_dim,
    # out_dim and hidden_size are defined outside the visible code.
    average_metric_list = []

    for testit in range(test_num):
        print('======== Start Test ', testit, ' ========')

        # Seed NumPy's global RNG with a per-trial value (1, 4, 7, ...)
        seed = testit * 3 + 1
        np.random.seed(seed)

        tasks = sample_tasks(dim, difficulties)
        print(tasks)

        # A new MLP is built on every trial: tanh hidden units, no output
        # nonlinearity
        network = MLP(
            input_shape=(in_dim, ),
            output_dim=out_dim,
            hidden_sizes=hidden_size,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
        )

        # Symbolic targets and the mean squared error against the network's
        # symbolic output
        out_var = TT.matrix('out_var')
        prediction = network._output
        loss = lasagne.objectives.squared_error(prediction, out_var)
        loss = loss.mean()
        # SGD updates (learning rate 0.002) over the trainable parameters
        params = network.get_params(trainable=True)
        updates = lasagne.updates.sgd(loss, params, learning_rate=0.002)
        # Compiled training step: takes (inputs, targets), applies the
        # updates and returns the loss
        train_fn = T.function([network.input_layer.input_var, out_var],
                              loss,
                              updates=updates,
                              allow_input_downcast=True)
        # Second symbolic mean-squared-error expression; how it is used lies
        # beyond this excerpt
        ls = TT.mean((prediction - out_var)**2)
Exemplo n.º 15
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        aux_pred_step=3,
        aux_pred_dim=4,
        skip_last=-1,
        copy_output=False,
    ):
        """
        Build a Gaussian MLP policy together with an auxiliary prediction
        network (MLPAux) that is constructed on top of the mean network.

        :param env_spec: environment spec; its action space must be a Box
        :param hidden_sizes: sizes of the fully-connected hidden layers of the mean network
        :param learn_std: whether the constant log-std parameter is trainable
        :param init_std: initial std of the constant log-std parameter
        :param adaptive_std: if True, the log std is produced by its own MLP on the observation
        :param std_share_network: not referenced in this constructor
        :param std_hidden_sizes: hidden layer sizes of the std MLP (only with adaptive_std)
        :param min_std: lower bound applied to the std (through its log); None disables it
        :param std_hidden_nonlinearity: hidden nonlinearity of the std MLP
        :param hidden_nonlinearity: hidden nonlinearity of the mean network
        :param output_nonlinearity: output nonlinearity of the mean network
        :param mean_network: custom mean network; when given, the mean-network size and
            nonlinearity arguments are not used
        :param std_network: custom network whose output layer gives the log std
        :param dist_cls: distribution class, instantiated with the action dimension
        :param aux_pred_step: forwarded to MLPAux as its first positional argument
        :param aux_pred_dim: forwarded to MLPAux as its second positional argument
        :param skip_last: forwarded to MLPAux
        :param copy_output: forwarded to MLPAux
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # Mean network: use the supplied one or fall back to a plain MLP
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        # Auxiliary prediction network built from the mean network
        aux_net = MLPAux(
            aux_pred_step,
            aux_pred_dim,
            None,
            mean_network,
            skip_last=skip_last,
            copy_output=copy_output,
        )
        self._aux_pred_network = aux_net

        # Auxiliary objective: mean squared error, optimised with Adam
        aux_target_var = TT.matrix('aux_targets')
        prediction = aux_net._output
        loss = lasagne.objectives.squared_error(
            prediction, aux_target_var).mean()
        updates = lasagne.updates.adam(
            loss, aux_net.get_params(trainable=True), learning_rate=0.001)
        self.aux_train_fn = T.function(
            [aux_net.input_layer.input_var, aux_target_var],
            loss,
            updates=updates)
        self.aux_loss = T.function(
            [aux_net.input_layer.input_var, aux_target_var],
            loss)

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        # Log-std layer: custom network > adaptive MLP > constant parameter
        if std_network is not None:
            l_log_std = std_network.output_layer
        elif adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var, aux_pred_var = L.get_output(
            [l_mean, l_log_std, aux_net.output_layer])

        # Clamp the log std from below so the std never drops under min_std
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var = mean_var
        self._log_std_var = log_std_var
        self._aux_pred_var = aux_pred_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(
            self, [l_mean, l_log_std, aux_net.output_layer])
        super(GaussianMLPAuxPolicy, self).__init__(env_spec)

        # Compiled forward passes: action distribution and aux prediction
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
        self._f_auxpred = ext.compile_function(
            inputs=[aux_net.input_layer.input_var],
            outputs=[prediction],
        )
Exemplo n.º 16
0
    # NOTE(review): this is an excerpt; test_num, dim, difficulties, in_dim,
    # out_dim and hidden_size are defined outside the visible code.
    average_metric_list = []

    for testit in range(test_num):
        print('======== Start Test ', testit, ' ========')
        # Per-trial seed (1, 4, 7, ...) for NumPy's global RNG
        seed = testit * 3 + 1

        np.random.seed(seed)

        tasks = sample_tasks(dim, difficulties)
        print(tasks)

        # A new MLP is built on every trial: tanh hidden units, no output
        # nonlinearity
        network = MLP(
            input_shape=(in_dim, ),
            output_dim=out_dim,
            hidden_sizes=hidden_size,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
        )

        # Symbolic targets and the mean squared error against the network's
        # symbolic output
        out_var = TT.matrix('out_var')
        prediction = network._output
        loss = lasagne.objectives.squared_error(prediction, out_var)
        loss = loss.mean()
        # Adam updates (learning rate 0.001) over the trainable parameters
        params = network.get_params(trainable=True)
        updates = lasagne.updates.adam(loss, params, learning_rate=0.001)
        # Compiled training step: takes (inputs, targets), applies the
        # updates and returns the loss
        train_fn = T.function([network.input_layer.input_var, out_var],
                              loss,
                              updates=updates,
                              allow_input_downcast=True)
        # Second symbolic mean-squared-error expression; how it is used lies
        # beyond this excerpt
        ls = TT.mean((prediction - out_var)**2)
Exemplo n.º 17
0
    def __init__(
        self,
        env_spec,
        zero_gradient_cutoff,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        adversarial=True,
        eps=0.1,
        probability=0.0,
        use_dynamics=False,
        random=False,
        observable_noise=False,
        use_max_norm=True,
        record_traj=False,
        set_dynamics=None,
        mask_augmentation=False,
    ):
        """
        Build a Gaussian MLP policy that also exposes the gradients of its
        mean and std outputs with respect to the observation, and store the
        settings that control adversarial perturbations.

        Gaussian policy parameters:

        :param env_spec: environment spec; its action space must be a Box
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: whether the std is trainable
        :param init_std: initial std
        :param adaptive_std: if True, the log std comes from its own MLP on the observation
        :param std_share_network: not referenced in this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: lower threshold for the std, to avoid numerical issues; None disables it
        :param std_hidden_nonlinearity: hidden nonlinearity of the std MLP
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: defines the probability distribution over actions

        Parameters specific to AdversarialPolicy:

        :param adversarial: whether the policy should incorporate adversarial states during rollout
        :param eps: the strength of the adversarial perturbation
        :param probability: frequency of adversarial updates. If 0, do exactly one update at the
                            beginning of every episode
        :param use_dynamics: if True, generate adversarial dynamics updates, otherwise do
                             adversarial state updates
        :param random: if True, use a random perturbation instead of an adversarial perturbation
        :param observable_noise: if True, don't set the adversarial state in the environment,
                                 treat it as noise on the observation
        :param zero_gradient_cutoff: cutoff index for zero-ing out gradients - useful when doing
                                     adversarial dynamics vs. adversarial states, when gradients
                                     are wanted for only one section of the augmented state
                                     vector. Also taken as the size of the original,
                                     non-augmented state. Must not be None.
        :param use_max_norm: if True, use the Fast Gradient Sign Method (FGSM) to generate
                             adversarial perturbations, else use full gradient ascent
        :param record_traj: if True, rollout dictionaries will contain qpos and qvel
                            trajectories, which is useful for plotting
        :param set_dynamics: if provided, the next rollout initializes the environment to the
                             passed dynamics
        :param mask_augmentation: if True, don't augment the state (the environment still
                                  appends the dynamics parameters, but the policy ignores
                                  those dimensions)

        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # TODO: make a more elegant solution to this
        # The original, unaugmented state size is assumed to be provided.
        assert zero_gradient_cutoff is not None

        # When the augmentation is masked out, the network only sees the
        # original state dimensions
        obs_dim = zero_gradient_cutoff if mask_augmentation else obs_dim

        # Mean network: use the supplied one or fall back to a plain MLP
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        # Log-std layer: custom network > adaptive MLP > constant parameter
        if std_network is not None:
            l_log_std = std_network.output_layer
        elif adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log std from below so the std never drops under min_std
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var = mean_var
        self._log_std_var = log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        # Actual standard deviation is the exponential of the log std
        self._tru_std_var = TT.exp(log_std_var)

        # Gradients of the L2 norms of the mean and of the std with respect
        # to the observation; a std that does not depend on the observation
        # only triggers a warning
        self._mean_grad = theano.grad(mean_var.norm(2), obs_var)
        self._std_grad = theano.grad(
            self._tru_std_var.norm(2),
            obs_var,
            disconnected_inputs='warn',
        )

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(AdversarialPolicy, self).__init__(env_spec)

        # Compiled forward pass for the action distribution
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # Compiled function returning both observation gradients
        self._f_grad_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[self._mean_grad, self._std_grad],
        )

        # Adversarial settings, stored as given
        self.adversarial = adversarial
        self.eps = eps
        self.probability = probability
        self.use_dynamics = use_dynamics
        self.random = random
        self.observable_noise = observable_noise
        self.zero_gradient_cutoff = zero_gradient_cutoff
        self.use_max_norm = use_max_norm
        self.record_traj = record_traj
        self.set_dynamics = set_dynamics
        self.mask_augmentation = mask_augmentation
Exemplo n.º 18
0
    def __init__(
        self,
        env_spec,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """
        Build a Gaussian MLP policy whose network input is the observation
        extended with latent variables.

        :param env_spec: environment spec; its action space must be a Box
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables: 'normal', 'bernoulli' or
        'categorical'; anything else raises NotImplementedError
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the
        rollout (or whenever the agent is reset, which can happen several times along the rollout
        with rollout in utils_snn)
        :param hidden_sizes: hidden layer sizes of the mean network
        :param learn_std: whether the constant log-std parameter is trainable
        :param init_std: initial std of the constant log-std parameter
        :param adaptive_std: if True, the log std comes from its own MLP on the same input
        :param std_share_network: not referenced in this constructor
        :param std_hidden_sizes: hidden layer sizes of the std MLP (only with adaptive_std)
        :param std_hidden_nonlinearity: hidden nonlinearity of the std MLP
        :param hidden_nonlinearity: hidden nonlinearity of the mean network
        :param output_nonlinearity: output nonlinearity of the mean network
        :param min_std: lower bound applied to the std (through its log); None disables it
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        # if this is not empty when using reset() it will use this latent
        self.pre_fix_latent = np.array([])
        # this will hold the latent variables sampled in reset()
        self.latent_fix = np.array([])
        self._set_std_to_0 = False

        # Prior over the latent variables and its fixed parameters
        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = {
                'mean': np.zeros(self.latent_dim),
                'log_std': np.zeros(self.latent_dim),
            }
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = {'p': 0.5 * np.ones(self.latent_dim)}
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                # uniform probabilities over the categories
                self.latent_dist_info = {
                    'prob': 1. / self.latent_dim * np.ones(self.latent_dim),
                }
            else:
                self.latent_dist_info = {'prob': np.ones(self.latent_dim)}
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # Network input size: observation plus latents, plus one entry per
        # (observation, latent) pair when bilinear integration is used
        if self.bilinear_integration:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim +\
                      env_spec.observation_space.flat_dim * latent_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim

        action_dim = env_spec.action_space.flat_dim

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        # Log-std layer: either its own MLP fed by the same input variable,
        # or a constant parameter attached to the mean network's input layer
        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_var=obs_var,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
                name="log_stdMLP",
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        # Clamp the log std from below so the std never drops under min_std
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn, self).__init__(env_spec)

        # Compiled forward pass for the action distribution
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
Exemplo n.º 19
0
    def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self._optimizer = optimizer

        if mean_network is None:
            mean_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = MLP(
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))
        y_mean_var = theano.shared(np.zeros((1, output_dim)),
                                   name="y_mean",
                                   broadcastable=(True, False))
        y_std_var = theano.shared(np.ones((1, output_dim)),
                                  name="y_std",
                                  broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian()

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        mean_kl = TT.mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        loss = -TT.mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
    def __init__(
            self,
            env_spec,
            env,
            pkl_path=None,
            json_path=None,
            npz_path=None,
            trainable_snn=True,
            ##CF - latents units at the input
            latent_dim=3,  # we keep all these as the dim of the output of the other MLP and others that we will need!
            latent_name='categorical',
            bilinear_integration=False,  # again, needs to match!
            resample=False,  # this can change: frequency of resampling the latent?
            hidden_sizes_snn=(32, 32),
            hidden_sizes_selector=(10, 10),
            external_latent=False,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            min_std=1e-4,
    ):
        """
        Build a Gaussian policy whose "SNN" action network is driven by the robot part of the
        observation combined with a latent/selector vector.

        :param env_spec: spec of the mdp; its action space must be a Box and its flat observation dim
        must equal robot dim + maze dim of `env`
        :param env: environment used to read the robot / maze observation dimensions
        :param pkl_path: path (relative to config.PROJECT_PATH) of a joblib file holding a "policy" entry;
        its hyper-parameters override the ones given here and its params warm-start the SNN layers
        :param json_path: path (relative to config.PROJECT_PATH) of a json file whose
        ['json_args']['policy'] entry overrides the hyper-parameters; checked before pkl_path
        :param npz_path: path (relative to config.PROJECT_PATH) of the params used, together with
        json_path, to warm-start the SNN layers
        :param trainable_snn: if False, the "trainable" tag is stripped from all mean / log-std layers
        :param latent_dim: dimension of the latent (selector) vector
        :param latent_name: 'normal', 'bernoulli' or 'categorical'; anything else raises NotImplementedError
        :param bilinear_integration: integrate obs and latent with a BilinearIntegrationLayer instead of
        a plain concatenation
        :param resample: stored on the instance; not otherwise used in this constructor
        :param hidden_sizes_snn: hidden layer sizes of the action (mean) network
        :param hidden_sizes_selector: hidden layer sizes of the softmax selector network
        :param external_latent: if True the selection is a non-trainable ParamLayer backed by
        self.shared_latent_var instead of the selector network
        :param learn_std: Is std trainable (only used when adaptive_std is False)
        :param init_std: Initial std (only used when adaptive_std is False)
        :param adaptive_std: if True the log std is produced by its own MLP on the same input
        :param std_share_network: not used in this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param std_hidden_nonlinearity: nonlinearity of the hidden layers of the std network
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer of the mean network
        :param min_std: lower bound applied to the std (through its log); None disables the clipping
        :return:
        """
        self.latent_dim = latent_dim  ## could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes_snn = hidden_sizes_snn
        self.hidden_sizes_selector = hidden_sizes_selector

        self.pre_fix_latent = np.array([])  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array([])  # this will hold the latents variable sampled in reset()
        self.shared_latent_var = theano.shared(self.latent_fix)  # this is for external lat! update that
        self._set_std_to_0 = False

        self.trainable_snn = trainable_snn
        self.external_latent = external_latent
        self.pkl_path = pkl_path
        self.json_path = json_path
        self.npz_path = npz_path
        self.old_policy = None

        if self.json_path:  # there is another one after defining all the NN to warm-start the params of the SNN
            # context manager so the json file handle is closed as soon as it has been parsed
            with open(os.path.join(config.PROJECT_PATH, self.json_path), 'r') as json_file:
                data = json.load(json_file)
            self.old_policy_json = data['json_args']["policy"]
            self.latent_dim = self.old_policy_json['latent_dim']
            self.latent_name = self.old_policy_json['latent_name']
            self.bilinear_integration = self.old_policy_json['bilinear_integration']
            self.resample = self.old_policy_json['resample']  # this could not be needed...
            self.min_std = self.old_policy_json['min_std']
            self.hidden_sizes_snn = self.old_policy_json['hidden_sizes']
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this could not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))  # this is an empty array
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.external_latent:  # in case we want to fix the latent externally
            l_all_obs_var = L.InputLayer(shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
            all_obs_var = l_all_obs_var.input_var
            # this is a simple layer that directly outputs self.shared_latent_var
            l_selection = ParamLayer(incoming=l_all_obs_var, num_units=self.latent_dim, param=self.shared_latent_var,
                                     trainable=False)  # Rui: change False to True?
            selection_var = L.get_output(l_selection)

        else:
            # create network with softmax output: it will be the latent 'selector'!
            latent_selection_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
                output_dim=self.latent_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = latent_selection_network.input_layer
            all_obs_var = latent_selection_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to latents)
            l_selection = latent_selection_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        # (only the robot part is fed to the action network below; the maze split is kept as in
        # the original construction but is not consumed in this constructor)
        l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim, end_index=None)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # Enlarge obs with the selectors (or latents). Here just computing the final input dim
        if self.bilinear_integration:
            l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection])
        else:
            l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection])

        action_dim = env_spec.action_space.flat_dim

        # create the action network
        mean_network = MLP(
            input_layer=l_obs_snn,  # input the layer that handles the integration of the selector
            output_dim=action_dim,
            hidden_sizes=self.hidden_sizes_snn,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer

        if adaptive_std:
            log_std_network = MLP(
                input_layer=l_obs_snn,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
                name="log_stdMLP"
            )
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if not self.trainable_snn:
            for layer in self._layers_snn:
                # params of layer are OrDict: key=the shared var, val=tags
                for tags in layer.params.values():
                    # discard (not remove): a param created with trainable=False (e.g. the log-std
                    # ParamLayer when learn_std is False) has no "trainable" tag to begin with
                    tags.discard("trainable")

        if self.json_path and self.npz_path:
            warm_params_dict = dict(np.load(os.path.join(config.PROJECT_PATH, self.npz_path)))
            self.set_params_snn(warm_params_dict)
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec)

        # debug: expose the integrated (obs + selection) input of the action network
        obs_snn_var = L.get_output(l_obs_snn)
        self._l_obs_snn = ext.compile_function(
            inputs=[all_obs_var],
            outputs=obs_snn_var,
        )
        self._mean = ext.compile_function(
            inputs=[all_obs_var],
            outputs=mean_var,
        )

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )
Exemplo n.º 21
0
    def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=True,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        Set up an MLP with sigmoid outputs that is fitted by minimizing the negative Bernoulli
        log-likelihood of the targets, optionally under a mean-KL constraint.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param predict_all: not used in this constructor.
        :param hidden_sizes: Number of hidden units of each layer of the network.
        :param hidden_nonlinearity: Non-linearity used for each hidden layer of the network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood. When None, a
        PenaltyLbfgsOptimizer (trust region) or an LbfgsOptimizer is created.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: stored on the instance as _normalize_inputs.
        :param name: stored on the instance as _name.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            optimizer = (PenaltyLbfgsOptimizer()
                         if use_trust_region else LbfgsOptimizer())

        self.output_dim = output_dim
        self._optimizer = optimizer

        network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.sigmoid,
        )
        l_p = network.output_layer

        LasagnePowered.__init__(self, [l_p])

        # symbolic inputs: observations, integer targets and the previous probabilities
        inputs_sym = network.input_layer.input_var
        targets_sym = TT.imatrix("ys")
        prev_p_sym = TT.matrix("old_p")

        # input statistics live in shared variables with a broadcastable leading axis
        stats_shape = (1, ) + input_shape
        stats_broadcast = (True, ) + (False, ) * len(input_shape)
        x_mean_var = theano.shared(np.zeros(stats_shape),
                                   name="x_mean",
                                   broadcastable=stats_broadcast)
        x_std_var = theano.shared(np.ones(stats_shape),
                                  name="x_std",
                                  broadcastable=stats_broadcast)

        whitened_inputs = (inputs_sym - x_mean_var) / x_std_var
        p_sym = L.get_output(l_p, {network.input_layer: whitened_inputs})

        # posterior of the latent at every step, wrt obs-act. Same along batch if recurrent
        new_info = dict(p=p_sym)
        prev_info = dict(p=prev_p_sym)

        bernoulli = Bernoulli(output_dim)
        self._dist = bernoulli

        mean_kl = TT.mean(bernoulli.kl_sym(prev_info, new_info))
        # if not using TR, still log KL
        self._mean_kl = ext.compile_function([inputs_sym, prev_p_sym], mean_kl)

        # regressor just wants to min -loglik of data ys
        neg_log_lik = -TT.mean(
            bernoulli.log_likelihood_sym(targets_sym, new_info))

        # this gives 0 or 1, depending what is closer to the p_var
        hard_prediction = p_sym >= 0.5

        self._f_predict = ext.compile_function([inputs_sym], hard_prediction)
        # for consistency with gauss_mlp_reg this should be ._f_pdists
        self._f_p = ext.compile_function([inputs_sym], p_sym)
        self._l_p = l_p

        opt_inputs = [inputs_sym, targets_sym]
        opt_kwargs = dict(
            loss=neg_log_lik,
            target=self,
            network_outputs=[p_sym],
        )
        if use_trust_region:
            # the KL constraint additionally needs the previous probabilities as input
            opt_kwargs["leq_constraint"] = (mean_kl, step_size)
            opt_inputs.append(prev_p_sym)
        opt_kwargs["inputs"] = opt_inputs

        self._optimizer.update_opt(**opt_kwargs)

        self._name = name
        self._use_trust_region = use_trust_region

        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._normalize_inputs = normalize_inputs
    def __init__(
        self,
        env_spec,
        env,
        pkl_paths=(),
        json_paths=(),
        npz_paths=(),
        trainable_old=True,
        external_selector=False,
        hidden_sizes_selector=(10, 10),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """
        Build a Gaussian policy that mixes several pre-trained ("old") policies: one mean network
        and one log-std parameter per old policy, combined through a selector vector.

        :param env_spec: spec of the mdp; its action space must be a Box and its flat observation dim
        must equal robot dim + maze dim of `env`
        :param env: environment used to read the robot / maze observation dimensions
        :param pkl_paths: tuple/list of pkl paths
        :param json_paths: tuple/list of json paths; the hidden sizes of every old policy are read
        from these files, so one is expected per old policy
        :param npz_paths: tuple/list of npz paths
        :param trainable_old: Are the old policies still trainable
        :param external_selector: is the linear combination of the old policies outputs fixed externally
        :param hidden_sizes_selector: list of sizes for the hidden layers of the selector network
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: not used in this constructor
        :param std_share_network: not used in this constructor
        :param std_hidden_sizes: not used in this constructor
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity: not used in this constructor
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        """

        def _indexed_param_name(name, index):
            # Map a parameter name of a single old policy to the name used for old policy `index`.
            if name == 'output_log_std.param':
                return 'output_log_std{}.param'.format(index)
            if name[:8] == 'meanMLP_':
                return 'meanMLP{}_'.format(index) + name[8:]
            return 'meanMLP{}_'.format(index) + name

        # define where are the old policies to use and what to do with them:
        self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
        self.pkl_paths = pkl_paths
        self.json_paths = json_paths
        self.npz_paths = npz_paths
        self.selector_dim = max(
            len(json_paths), len(pkl_paths))  # pkl could be zero if giving npz
        # if not use a selector NN here, just externally fixed selector variable:
        self.external_selector = external_selector  # whether to use the selectorNN defined here or the pre_fix_selector
        self.pre_fix_selector = np.zeros(
            (self.selector_dim)
        )  # if this is not empty when using reset() it will use this selector
        self.selector_fix = np.zeros(
            (self.selector_dim
             ))  # this will hold the selectors variable sampled in reset()
        self.shared_selector_var = theano.shared(
            self.selector_fix)  # this is for external selector! update that
        # else, describe the MLP used:
        self.hidden_sizes_selector = hidden_sizes_selector  # size of the selector NN defined here
        self.min_std = min_std
        self._set_std_to_0 = False

        self.action_dim = env_spec.action_space.flat_dim  # not checking that all the old policies have this act_dim

        self.old_hidden_sizes = []
        # assume json always given
        for json_path in self.json_paths:
            # context manager so every json file handle is closed right after parsing
            with open(os.path.join(config.PROJECT_PATH, json_path),
                      'r') as json_file:
                data = json.load(json_file)
            old_json_policy = data['json_args']["policy"]
            self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.external_selector:  # in case we want to fix the selector externally
            l_all_obs_var = L.InputLayer(
                shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, ))
            all_obs_var = l_all_obs_var.input_var
            l_selection = ParamLayer(incoming=l_all_obs_var,
                                     num_units=self.selector_dim,
                                     param=self.shared_selector_var,
                                     trainable=False)
            selection_var = L.get_output(l_selection)
        else:
            # create network with softmax output: it will be the selector!
            selector_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim, ),
                output_dim=self.selector_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = selector_network.input_layer
            all_obs_var = selector_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to selectors)
            l_selection = selector_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        # (only the robot part feeds the old policies below; the maze split is kept as in the
        # original construction but is not consumed in this constructor)
        l_obs_robot = CropLayer(l_all_obs_var,
                                start_index=None,
                                end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var,
                               start_index=self.obs_robot_dim,
                               end_index=None)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # create the action networks
        self.old_l_means = [
        ]  # I do this self in case I wanna access it from reset
        self.old_l_log_stds = []
        self.old_layers = []
        for i in range(self.selector_dim):
            mean_network = MLP(
                input_layer=l_obs_robot,
                output_dim=self.action_dim,
                hidden_sizes=self.old_hidden_sizes[i],
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="meanMLP{}".format(i),
            )
            self.old_l_means.append(mean_network.output_layer)
            self.old_layers += mean_network.layers

            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=self.action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std{}".format(i),
                trainable=learn_std,
            )
            self.old_l_log_stds.append(l_log_std)
            self.old_layers += [l_log_std]

        if not self.trainable_old:
            for layer in self.old_layers:
                # params of layer are OrDict: key=the shared var, val=tags
                for tags in layer.params.values():
                    # discard (not remove): the log-std params carry no "trainable" tag when
                    # learn_std is False, and the input layer is shared by every mean network
                    tags.discard("trainable")

        if self.json_paths and self.npz_paths:
            old_params_dict = {}
            for i, npz_path in enumerate(self.npz_paths):
                params_dict = dict(
                    np.load(os.path.join(config.PROJECT_PATH, npz_path)))
                for key, value in params_dict.items():
                    old_params_dict[_indexed_param_name(key, i)] = value
            self.set_old_params(old_params_dict)

        elif self.pkl_paths:
            old_params_dict = {}
            for i, pkl_path in enumerate(self.pkl_paths):
                data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
                params = data['policy'].get_params_internal()
                for param in params:
                    old_params_dict[_indexed_param_name(
                        param.name, i)] = param.get_value()
            self.set_old_params(old_params_dict)

        # new layers actually selecting the correct output
        l_mean = SumProdLayer(self.old_l_means + [l_selection])
        l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(self.action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

        # means of every old policy for a batch of observations
        self._f_old_means = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ])

        # same as above, followed by the selector output
        self._f_all_inputs = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ] + [selection_var])

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )
Exemplo n.º 23
0
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            dist_cls=DiagonalGaussian,
            obs_dim=6400,
    ):
        """
        :param env_spec: spec of the mdp; its action space must be Discrete
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: if True (and no std_network is given) the log std comes from its own MLP
        sharing the input layer of the mean network
        :param std_share_network: not used in this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity: nonlinearity of the hidden layers of the std network
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: distribution class, instantiated with the action dimension
        :param obs_dim: flat input dimension of the networks created here. It is deliberately not
        read from env_spec.observation_space; the default of 6400 is the value that used to be
        hard-coded in this constructor
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Discrete)

        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim,),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    input_shape=(obs_dim,),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                # state-independent log std: a single parameter vector attached to the input layer
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            # clip in log space so the std never drops below min_std
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
Exemplo n.º 24
0

## Suggested by visak: rebuild the baseline's value network as a stand-alone MLP so that
## state values can be queried through a plain compiled Theano function.

# First: build the experiment and fetch its baseline.
expDict = tFuncs.buildExpDict()
env, polDict, trainDict = tFuncs.buildExperiment(expDict)
baseline=polDict['baseline']

# Parameter shapes of the baseline regressor; only the first entry is used below.
blLayerShapes = baseline._regressor.get_param_shapes()

# Current parameter values of the regressor's mean network.
blParams = L.get_all_param_values(baseline._regressor._mean_network.output_layer)

    
from rllab.core.network import MLP
# Second: create a fresh single-output MLP and copy the baseline parameters into it.
# NOTE(review): blLayerShapes[0][0] is presumably the input dimension (first axis of the
# first weight matrix), and the architecture / rectify nonlinearity here must match the
# baseline's mean network for set_all_param_values to succeed -- confirm.
net = MLP(input_shape=(blLayerShapes[0][0],),
            output_dim=1,
            hidden_sizes=expDict['mlpArch'],
            hidden_nonlinearity=lasagne.nonlinearities.rectify,
            output_nonlinearity=None,
            )
    
L.set_all_param_values(net.output_layer,blParams)
X = net.input_layer.input_var

# Compile a function from the network input to its (deterministic) output.
pred = L.get_output(net.output_layer,deterministic=True)
valueFunc = theano.function([X],pred)



# Third: you can then just query the value of a state using this.
# NOTE(review): `observations` is not defined in this snippet; it must be provided by the caller.

vf = valueFunc(observations)
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        npz_path=None,
        freeze_lst=None,
        reinit_lst=None,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
    ):
        """
        Build a Gaussian MLP policy, optionally warm-started from saved
        parameters, with per-layer freezing and re-initialisation.

        :param env_spec: spec of the environment; its action space must be a Box
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std. If None, 1.0 is used and parameters loaded
        from npz_path are applied with set_param_values_transfer instead of
        set_param_values
        :param adaptive_std: if True (and no std_network is given), the log std
        is produced by a separate MLP on the same input layer
        :param std_share_network: not used by this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: lower bound applied to the std (through its log), to
        avoid numerical issues; None disables the bound
        :param npz_path: path, relative to config.PROJECT_PATH, of an npz
        archive whose 'params' entry holds parameter values to load
        :param freeze_lst: one flag per entry of self._layers after the first;
        the parameters of flagged layers lose their "trainable" tag
        :param reinit_lst: one flag per entry of self._layers after the first;
        the parameters of flagged layers are re-initialised
        :param std_hidden_nonlinearity: nonlinearity for the std network's hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :param dist_cls: distribution class, instantiated with the action dimension
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # Remember whether the caller supplied an initial std: this selects
        # how parameters loaded from npz_path are applied further down.
        set_std_params = init_std is not None
        if init_std is None:
            init_std = 1.0

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network
        self._layers_mean = mean_network.layers

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
            # Previously left unset on this path, which made the
            # self._layers concatenation below raise AttributeError.
            self._layers_log_std = std_network.layers
        elif adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
            self._layers_log_std = std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers = self._layers_mean + self._layers_log_std

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        if npz_path is not None:
            # Close the archive as soon as the needed array has been read.
            with np.load(os.path.join(config.PROJECT_PATH,
                                      npz_path)) as archive:
                param_values = archive['params']
            # todo: don't forget about this
            if set_std_params:
                self.set_param_values(param_values)
            else:
                self.set_param_values_transfer(param_values)

        if freeze_lst is not None:
            # The first entry of self._layers is skipped, hence the "- 1".
            assert len(freeze_lst) == len(self._layers) - 1
            for layer, should_freeze in zip(self._layers[1:], freeze_lst):
                if should_freeze:
                    for tags in layer.params.values():
                        # discard() tolerates parameters that are already
                        # non-trainable (e.g. a std created with
                        # learn_std=False), where remove() raised KeyError.
                        tags.discard("trainable")

        if reinit_lst is not None:
            # Validate reinit_lst itself; the original checked freeze_lst here
            # and crashed whenever only reinit_lst was supplied.
            assert len(reinit_lst) == len(
                self._layers) - 1  # since input layer is counted
            for layer, should_reinit in zip(self._layers[1:], reinit_lst):
                if should_reinit:
                    print("reinitialized")
                    for v in layer.params:
                        val = v.get_value()
                        # Parameters with fewer than two dimensions are
                        # zeroed; the others get a Glorot-uniform draw.
                        if len(val.shape) < 2:
                            v.set_value(lasagne.init.Constant(0.0)(val.shape))
                        else:
                            v.set_value(lasagne.init.GlorotUniform()(
                                val.shape))
                else:
                    print("did not reinit")
Exemplo n.º 26
0
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        Set up a categorical regressor: a softmax network, its compiled
        prediction functions and the optimizer that fits it.

        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param prob_network: Network to use as-is; when None an MLP with a
        softmax output is built from the arguments below.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        When None, PenaltyLbfgsOptimizer is used with a trust region and
        LbfgsOptimizer without one.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param normalize_inputs: Stored on the instance as _normalize_inputs.
        :param name: Stored on the instance as _name.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            optimizer = (PenaltyLbfgsOptimizer()
                         if use_trust_region else LbfgsOptimizer())

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        prob_layer = prob_network.output_layer

        LasagnePowered.__init__(self, [prob_layer])

        # Symbolic inputs: observations, integer targets and the
        # probabilities used on the "old" side of the KL term.
        inputs_var = prob_network.input_layer.input_var
        targets_var = TT.imatrix("ys")
        prev_prob_var = TT.matrix("old_prob")

        # Shared input statistics with a broadcastable leading axis.
        stats_shape = (1,) + input_shape
        stats_broadcast = (True,) + (False,) * len(input_shape)
        mean_shared = theano.shared(
            np.zeros(stats_shape),
            name="x_mean",
            broadcastable=stats_broadcast
        )
        std_shared = theano.shared(
            np.ones(stats_shape),
            name="x_std",
            broadcastable=stats_broadcast
        )

        whitened_inputs = (inputs_var - mean_shared) / std_shared

        # The network output is evaluated on the normalized inputs.
        prob_var = L.get_output(
            prob_layer, {prob_network.input_layer: whitened_inputs})

        dist = self._dist = Categorical(output_dim)

        new_info = dict(prob=prob_var)
        mean_kl = TT.mean(dist.kl_sym(dict(prob=prev_prob_var), new_info))

        # Negative mean log-likelihood of the targets.
        loss = - TT.mean(dist.log_likelihood_sym(targets_var, new_info))

        # One-hot encoding of the most probable class.
        predicted = special.to_onehot_sym(
            TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([inputs_var], predicted)
        self._f_prob = ext.compile_function([inputs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = prob_layer

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        # The trust-region variant adds the KL constraint and needs the old
        # probabilities as an extra input.
        optimizer_inputs = [inputs_var, targets_var]
        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_inputs.append(prev_prob_var)
        optimizer_args["inputs"] = optimizer_inputs

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = mean_shared
        self._x_std_var = std_shared
Exemplo n.º 27
0
def create_policy_and_env(env, seed, policy, policy_file):
    """
    Create a normalized rllab environment and a Gaussian MLP policy for it.

    :param env: environment name, resolved through rllab_env_from_name
    :param seed: currently unused
    :param policy: policy type, either 'linear' (no hidden layer) or
    'simple-nn' (one hidden layer of 16 units)
    :param policy_file: currently unused; the weights are hard-coded below
    :return: tuple (env, policy) with the normalized environment and the
    GaussianMLPPolicy built on it
    :raises Exception: if the policy type or the policy initialization is
    not recognized
    """
    # Session
    sess = U.single_threaded_session()
    sess.__enter__()

    # The gym/baselines environment and MlpPolicy construction paths used to
    # live here as bare triple-quoted strings (no-op statements); they were
    # never executed and have been removed.
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim

    # Policy initialization
    policy_init = 'zeros'
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy. The architecture selected above is now actually
    # used: previously [16] was hard-coded here, so 'linear' silently built
    # the same network as 'simple-nn'.
    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    gaussian_policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=hidden_sizes,
        mean_network=mean_network)

    # TODO: load the weights from policy_file and apply them to the policy.
    # For now a fixed weight vector is parsed and only printed.
    # str.split() without arguments splits on any run of whitespace, so the
    # line breaks and alignment spaces in the literal need no special care.
    weights = [
        float(token) for token in
        """0.02483223 -0.17645608  0.77450023  0.54770311  0.33464952 -0.29827444
 -0.62524864  0.46413191 -0.31990006 -0.32972003  0.38753632 -0.15170416
 -0.43518174 -0.15718946  0.19542838 -0.02774486  0.13546377 -0.18621497
  0.18444675  0.774653    0.19710147 -0.20958339  0.15098953  0.42278248
 -0.53121678 -0.33369185 -0.04331141 -0.2140371   0.27077572  0.58111134
  0.34637848  0.56956591  0.45061681 -0.15826946 -1.06925573 -0.39311001
 -0.35695692  0.14414285 -1.25332428 -0.24016012  0.17774961  0.23973508
 -0.65415459  1.53059934 -0.71953132  1.79764386  0.18561774  1.4640445
 -0.1625999   0.0606595  -0.22058723 -0.34247517  0.46232139  0.07013392
 -0.32074007  0.14488911  0.1123158   0.28914362  0.6727726  -0.58491444
  0.35895434  1.32873906 -0.0708237  -0.05147256  0.01689644  0.38244615
  0.10005984  0.71253728 -0.18824528 -0.15552894 -0.05634595  0.3517145
  0.20900426 -0.19631462 -0.03828797  0.08125694 -0.22894259 -0.08030374
  0.59522035 -0.1752422  -0.40809067  1.62409963 -1.39307047  0.81438794
 -0.54068521  0.19321547 -1.65661292  0.3264788   0.46482921 -0.01649974
 -0.79186757 -1.3378886  -0.57094913 -1.57079733 -1.78056839  1.05324632
 -2.14386428""".split()
    ]

    print(weights)

    return env, gaussian_policy
    # NOTE(review): this fragment has no visible function header and uses
    # names defined elsewhere (split_percentages, append, in_dim, out_dim,
    # hidden_size, load_init_policy). As the file stands it directly follows a
    # `return` at the same indentation, so it would be unreachable there; it
    # looks like the body of a separate supervised-training routine -- confirm.

    # Result accumulators: one (initially empty) learning curve per entry of
    # split_percentages.
    performances = []
    learning_curves = []
    for i in range(len(split_percentages)):
        learning_curves.append([])

    # Make sure the output directory for this run exists.
    if not os.path.exists('data/trained/gradient_temp/supervised_split_' + append):
        os.makedirs('data/trained/gradient_temp/supervised_split_' + append)

    average_metric_list = []

    print('======== Start Test ========')

    # Regression network: tanh hidden layers, no output nonlinearity.
    network = MLP(
            input_shape=(in_dim,),
            output_dim=out_dim,
            hidden_sizes=hidden_size,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
        )
    # Optionally replace the fresh network with one saved in the output directory.
    if load_init_policy:
        network = joblib.load('data/trained/gradient_temp/supervised_split_' + append + '/init_network.pkl')

    # Symbolic target and mean squared error between the network output and it.
    out_var = TT.matrix('out_var')
    prediction = network._output
    loss = lasagne.objectives.squared_error(prediction, out_var)
    loss = loss.mean()
    # Adam updates on the trainable parameters, compiled into a training step
    # that returns the loss.
    params = network.get_params(trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=0.0005)
    train_fn = T.function([network.input_layer.input_var, out_var], loss, updates=updates, allow_input_downcast=True)
    # Same mean squared error written out explicitly, and its symbolic
    # gradient with respect to the trainable parameters.
    ls = TT.mean((prediction - out_var)**2)
    grad = T.grad(ls, params, disconnected_inputs='warn')
Exemplo n.º 29
0
    def __init__(
        self,
        env_spec,
        hidden_sizes=(),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
        hlc_output_dim=0,
        subnet_split1=[2, 3, 4, 11, 12, 13],
        subnet_split2=[5, 6, 7, 14, 15, 16],
        sub_out_dim=3,
        option_dim=4,
    ):
        """
        Build a Gaussian policy whose mean is produced by an HMLPPhase network.

        :param env_spec: spec of the environment; its action space must be a Box
        :param hidden_sizes: hidden layer sizes forwarded to HMLPPhase
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: if True (and no std_network is given), the log std
        is produced by a separate MLP on the same input layer
        :param std_share_network: not used by this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: lower bound applied to the std (through its log);
        None disables the bound
        :param std_hidden_nonlinearity: nonlinearity for the std network's hidden layers
        :param hidden_nonlinearity: nonlinearity forwarded to HMLPPhase
        :param output_nonlinearity: not used by this constructor
        :param mean_network: custom network for the output mean; it must expose
        hlc_signal1, hlc_signal2, leg1_part and leg2_part, which are compiled
        into self.hidden_signals
        :param std_network: custom network for the output log std
        :param dist_cls: distribution class, instantiated with the action dimension
        :param hlc_output_dim: forwarded to HMLPPhase
        :param subnet_split1: forwarded to HMLPPhase
        :param subnet_split2: forwarded to HMLPPhase
        :param sub_out_dim: forwarded to HMLPPhase
        :param option_dim: forwarded to HMLPPhase
        :return:
        """
        # NOTE: the list defaults above are shared between calls; they are
        # kept for interface compatibility and only forwarded, never mutated
        # in this constructor.
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = HMLPPhase(
                hidden_sizes,
                hidden_nonlinearity,
                input_shape=(obs_dim, ),
                subnet_split1=subnet_split1,
                subnet_split2=subnet_split2,
                hlc_output_dim=hlc_output_dim,
                sub_out_dim=sub_out_dim,
                option_dim=option_dim,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        elif adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        # Compiled once; the original compiled this identical function twice
        # in a row and discarded the first result.
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # Exposes the intermediate signals of the mean network for a given
        # observation.
        self.hidden_signals = ext.compile_function(
            inputs=[obs_var],
            outputs=[
                mean_network.hlc_signal1, mean_network.hlc_signal2,
                mean_network.leg1_part, mean_network.leg2_part
            ])
Exemplo n.º 30
0
    def __init__(
        self,
        env_spec,
        env,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
        pkl_path=None,
    ):
        """
        Build a Gaussian MLP policy with latent variables, optionally restored
        from a pickled policy.

        :param env_spec: spec of the environment; its action space must be a Box
        :param env: environment, used to read the robot / maze observation dimensions
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables: 'normal',
        'bernoulli' or 'categorical'
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever
        agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: if True, the log std is produced by a separate MLP
        on the same input variable
        :param std_share_network: not used by this constructor
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param std_hidden_nonlinearity: nonlinearity for the std network's hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param min_std: lower bound applied to the std (through its log);
        None disables the bound
        :param pkl_path: path, relative to config.PROJECT_PATH, of a pickle whose
        "policy" entry is restored. When given, latent_dim, latent_name,
        bilinear_integration, resample and min_std are taken from that policy
        and its parameters are copied into this one.
        :raises NotImplementedError: if the latent distribution name is not recognized
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        self.pre_fix_latent = np.array(
            []
        )  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array(
            [])  # this will hold the latents variable sampled in reset()
        self._set_std_to_0 = False

        self.pkl_path = pkl_path

        if self.pkl_path:
            # Settings of the stored policy take precedence over the arguments.
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this could not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        # Use self.latent_name rather than the raw argument, so that a restored
        # policy gets the distribution matching its restored latent_dim.
        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions from env!
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        # Network input: robot observation and latent, plus their products
        # when bilinear integration is on.
        if self.bilinear_integration:
            obs_dim = self.obs_robot_dim + self.latent_dim +\
                      self.obs_robot_dim * self.latent_dim
        else:
            obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.

        action_dim = env_spec.action_space.flat_dim

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            log_std_network = MLP(input_shape=(obs_dim, ),
                                  input_var=obs_var,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if self.pkl_path:  # restore from pkl file
            # Reuse the policy loaded above instead of reading the pickle
            # from disk a second time.
            warm_params = self.old_policy.get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )