Exemplo n.º 1
0
    def __init__(self,
                 in_size,
                 out_size,
                 hidden_sizes,
                 nonlinearity=F.relu,
                 last_wscale=1):
        self.in_size = in_size
        self.out_size = out_size
        self.hidden_sizes = hidden_sizes
        self.nonlinearity = nonlinearity

        super().__init__()
        with self.init_scope():
            if hidden_sizes:
                hidden_layers = []
                hidden_layers.append(L.Linear(in_size, hidden_sizes[0]))
                for hin, hout in zip(hidden_sizes, hidden_sizes[1:]):
                    hidden_layers.append(L.Linear(hin, hout))
                self.hidden_layers = chainer.ChainList(*hidden_layers)
                self.output = L.Linear(hidden_sizes[-1],
                                       out_size,
                                       initialW=LeCunNormal(last_wscale))
            else:
                self.output = L.Linear(in_size,
                                       out_size,
                                       initialW=LeCunNormal(last_wscale))
Exemplo n.º 2
0
    def __init__(self,
                 n_input_channels,
                 action_size,
                 var,
                 n_hidden_layers=0,
                 n_hidden_channels=None,
                 min_action=None,
                 max_action=None,
                 bound_mean=False,
                 nonlinearity=F.relu,
                 mean_wscale=1):

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        if np.isscalar(var):
            self.var = np.full(action_size, var, dtype=np.float32)
        else:
            self.var = var
        layers = []
        if n_hidden_layers > 0:
            # Input to hidden
            layers.append(L.Linear(n_input_channels, n_hidden_channels))
            layers.append(self.nonlinearity)
            for _ in range(n_hidden_layers - 1):
                # Hidden to hidden
                layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
                layers.append(self.nonlinearity)
            # The last layer is used to compute the mean
            layers.append(
                L.Linear(n_hidden_channels,
                         action_size,
                         initialW=LeCunNormal(mean_wscale)))
        else:
            # There's only one layer for computing the mean
            layers.append(
                L.Linear(n_input_channels,
                         action_size,
                         initialW=LeCunNormal(mean_wscale)))

        if self.bound_mean:
            layers.append(
                lambda x: bound_by_tanh(x, self.min_action, self.max_action))

        def get_var_array(shape):
            self.var = self.xp.asarray(self.var)
            return self.xp.broadcast_to(self.var, shape)

        layers.append(lambda x: distribution.GaussianDistribution(
            x, get_var_array(x.shape)))
        super().__init__(*layers)
Exemplo n.º 3
0
    def __init__(self,
                 n_input_channels,
                 action_size,
                 n_hidden_layers=0,
                 n_hidden_channels=None,
                 min_action=None,
                 max_action=None,
                 bound_mean=False,
                 var_type='spherical',
                 nonlinearity=F.relu,
                 mean_wscale=1,
                 var_wscale=1,
                 var_bias=0,
                 min_var=0):

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        self.min_var = min_var
        var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

        self.hidden_layers = []
        if n_hidden_layers > 0:
            self.hidden_layers.append(
                L.Linear(n_input_channels, n_hidden_channels))
            for i in range(n_hidden_layers - 1):
                self.hidden_layers.append(
                    L.Linear(n_hidden_channels, n_hidden_channels))
            self.mean_layer = L.Linear(n_hidden_channels,
                                       action_size,
                                       initialW=LeCunNormal(mean_wscale))
            self.var_layer = L.Linear(n_hidden_channels,
                                      var_size,
                                      initialW=LeCunNormal(var_wscale),
                                      initial_bias=var_bias)
        else:
            self.mean_layer = L.Linear(n_input_channels,
                                       action_size,
                                       initialW=LeCunNormal(mean_wscale))
            self.var_layer = L.Linear(n_input_channels,
                                      var_size,
                                      initialW=LeCunNormal(var_wscale),
                                      initial_bias=var_bias)

        super().__init__(self.mean_layer, self.var_layer, *self.hidden_layers)
Exemplo n.º 4
0
    def __init__(self,
                 in_size,
                 out_size,
                 hidden_sizes,
                 normalize_input=True,
                 normalize_output=False,
                 nonlinearity=F.relu,
                 last_wscale=1):
        self.in_size = in_size
        self.out_size = out_size
        self.hidden_sizes = hidden_sizes
        self.normalize_input = normalize_input
        self.normalize_output = normalize_output
        self.nonlinearity = nonlinearity

        super().__init__()
        with self.init_scope():
            if normalize_input:
                self.input_bn = L.BatchNormalization(in_size)
                self.input_bn.avg_var[:] = 1

            if hidden_sizes:
                hidden_layers = []
                hidden_layers.append(LinearBN(in_size, hidden_sizes[0]))
                for hin, hout in zip(hidden_sizes, hidden_sizes[1:]):
                    hidden_layers.append(LinearBN(hin, hout))
                self.hidden_layers = chainer.ChainList(*hidden_layers)
                self.output = L.Linear(hidden_sizes[-1],
                                       out_size,
                                       initialW=LeCunNormal(last_wscale))
            else:
                self.output = L.Linear(in_size,
                                       out_size,
                                       initialW=LeCunNormal(last_wscale))

            if normalize_output:
                self.output_bn = L.BatchNormalization(out_size)
                self.output_bn.avg_var[:] = 1
 def __init__(self, n_dim_obs, n_dim_action, n_hidden_channels,
              n_hidden_layers, nonlinearity=F.relu, last_wscale=1.):
     self.n_input_channels = n_dim_obs + n_dim_action
     self.n_hidden_layers = n_hidden_layers
     self.n_hidden_channels = n_hidden_channels
     self.nonlinearity = nonlinearity
     super().__init__()
     with self.init_scope():
         self.fc = MLP(self.n_input_channels, n_hidden_channels,
                       [self.n_hidden_channels] * self.n_hidden_layers,
                       nonlinearity=nonlinearity,
                       )
         self.lstm = L.LSTM(n_hidden_channels, n_hidden_channels)
         self.out = L.Linear(n_hidden_channels, 1,
                             initialW=LeCunNormal(last_wscale))
Exemplo n.º 6
0
    def __init__(
        self,
        n_input_channels,
        action_size,
        n_hidden_layers=0,
        n_hidden_channels=None,
        min_action=None,
        max_action=None,
        bound_mean=False,
        var_type='spherical',
        nonlinearity=F.relu,
        mean_wscale=1,
        var_func=F.softplus,
        var_param_init=0,
    ):

        self.n_input_channels = n_input_channels
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.min_action = min_action
        self.max_action = max_action
        self.bound_mean = bound_mean
        self.nonlinearity = nonlinearity
        self.var_func = var_func
        var_size = {'spherical': 1, 'diagonal': action_size}[var_type]

        layers = []
        layers.append(L.Linear(n_input_channels, n_hidden_channels))
        for _ in range(n_hidden_layers - 1):
            layers.append(self.nonlinearity)
            layers.append(L.Linear(n_hidden_channels, n_hidden_channels))
        layers.append(self.nonlinearity)
        # The last layer is used to compute the mean
        layers.append(
            L.Linear(n_hidden_channels,
                     action_size,
                     initialW=LeCunNormal(mean_wscale)))

        if self.bound_mean:
            layers.append(
                lambda x: bound_by_tanh(x, self.min_action, self.max_action))

        super().__init__()
        with self.init_scope():
            self.hidden_layers = links.Sequence(*layers)
            self.var_param = chainer.Parameter(initializer=var_param_init,
                                               shape=(var_size, ))
    def __init__(self,
                 n_input_channels,
                 n_hidden_layers,
                 n_hidden_channels,
                 action_size,
                 min_action=None,
                 max_action=None,
                 bound_action=True,
                 nonlinearity=F.relu,
                 last_wscale=1.):
        self.n_input_channels = n_input_channels
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.action_size = action_size
        self.min_action = min_action
        self.max_action = max_action
        self.bound_action = bound_action

        if self.bound_action:

            def action_filter(x):
                return bound_by_tanh(x, self.min_action, self.max_action)
        else:
            action_filter = None

        model = chainer.Chain(
            fc=MLP(
                self.n_input_channels,
                n_hidden_channels,
                (self.n_hidden_channels, ) * self.n_hidden_layers,
                nonlinearity=nonlinearity,
            ),
            lstm=L.LSTM(n_hidden_channels, n_hidden_channels),
            out=L.Linear(n_hidden_channels,
                         action_size,
                         initialW=LeCunNormal(last_wscale)),
        )

        def model_call(model, x):
            h = nonlinearity(model.fc(x))
            h = model.lstm(h)
            h = model.out(h)
            return h

        super().__init__(model=model,
                         model_call=model_call,
                         action_filter=action_filter)
Exemplo n.º 8
0
def create_acer_agent(env):
    #our observation space dimension of malware
    obs_dim = env.observation_space.shape[0]
    #the list of actions that we can perform on the malware
    n_actions = env.action_space.n
    #our acer network
    #consists of pi (our policy) and our q (our q function)
    model = acer.ACERSeparateModel(
        pi=links.Sequence(L.Linear(obs_dim, 1024,
                                   initialW=LeCunNormal(1e-3)), F.relu,
                          L.Linear(1024, 512,
                                   initialW=LeCunNormal(1e-3)), F.relu,
                          L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
                          SoftmaxDistribution),
        q=links.Sequence(L.Linear(obs_dim, 1024, initialW=LeCunNormal(1e-3)),
                         F.relu, L.Linear(1024,
                                          512,
                                          initialW=LeCunNormal(1e-3)), F.relu,
                         L.Linear(512, n_actions, initialW=LeCunNormal(1e-3)),
                         DiscreteActionValue),
    )
    #optimizer for the acer
    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=1e-2, alpha=0.99)
    opt.setup(model)
    #hook to the chainer model
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(128)
    #the agent itself, params from original file
    agent = acer.ACER(
        model,
        opt,
        gamma=0.95,  # reward discount factor
        t_max=32,  # update the model after this many local steps
        replay_buffer=replay_buffer,
        n_times_replay=
        4,  # number of times experience replay is repeated for each update
        replay_start_size=
        64,  # don't start replay unless we have this many experiences in the buffer
        disable_online_update=True,  # rely only on experience buffer
        use_trust_region=True,  # enable trust region policy optimiztion
        trust_region_delta=0.1,  # a parameter for TRPO
        truncation_threshold=5.0,  # truncate large importance weights
        beta=1e-2,  # entropy regularization parameter
        phi=lambda obs: obs.astype(np.float32, copy=False))

    return agent
    def __init__(self,
                 n_dim_obs,
                 n_dim_action,
                 n_hidden_channels,
                 n_hidden_layers,
                 nonlinearity=F.relu,
                 last_wscale=1):
        self.n_input_channels = n_dim_obs + n_dim_action
        self.n_hidden_layers = n_hidden_layers
        self.n_hidden_channels = n_hidden_channels
        self.nonlinearity = nonlinearity

        layers = []
        assert self.n_hidden_layers >= 1
        layers.append(L.Linear(self.n_input_channels, self.n_hidden_channels))
        for i in range(self.n_hidden_layers - 1):
            layers.append(
                L.Linear(self.n_hidden_channels, self.n_hidden_channels))
        layers.append(
            L.Linear(self.n_hidden_channels,
                     1,
                     initialW=LeCunNormal(last_wscale)))
        super().__init__(*layers)
        self.output = layers[-1]
Exemplo n.º 10
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env',
                        type=str,
                        default='MsPacman-ramNoFrameskip-v4')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=5 * 10**6)
    parser.add_argument('--eval-interval', type=int, default=5 * 10**4)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one processes, the results will be no longer
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2**32

    #args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[process_idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            env = chainerrl.wrappers.Render(env)
        env.next_agent_num = 1
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=7e-4, eps=4e-3, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        def hook(env, agent, step):
            if step > env.next_agent_num * 5 * 10**5:
                agent.save('results/agent_' + str(env.next_agent_num))
                env.next_agent_num += 1

        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit,
                                      global_step_hooks=(hook, ))
Exemplo n.º 11
0
def main():
    import logging

    parser = argparse.ArgumentParser()
    parser.add_argument('processes', type=int)
    parser.add_argument('--env', type=str, default='CartPole-v0')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--t-max', type=int, default=50)
    parser.add_argument('--n-times-replay', type=int, default=4)
    parser.add_argument('--n-hidden-channels', type=int, default=100)
    parser.add_argument('--n-hidden-layers', type=int, default=2)
    parser.add_argument('--replay-capacity', type=int, default=5000)
    parser.add_argument('--replay-start-size', type=int, default=10**3)
    parser.add_argument('--disable-online-update', action='store_true')
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10**7)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--eval-n-runs', type=int, default=10)
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2)
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--logger-level', type=int, default=logging.DEBUG)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--truncation-threshold', type=float, default=5)
    parser.add_argument('--trust-region-delta', type=float, default=0.1)
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args, args.outdir)

    def make_env(process_idx, test):
        env = gym.make(args.env)
        if args.monitor and process_idx == 0:
            env = gym.wrappers.Monitor(env, args.outdir)
        # Scale rewards observed by agents
        if not test:
            misc.env_modifiers.make_reward_filtered(
                env, lambda x: x * args.reward_scale_factor)
        if args.render and process_idx == 0 and not test:
            misc.env_modifiers.make_rendered(env)
        return env

    sample_env = gym.make(args.env)
    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = sample_env.observation_space
    action_space = sample_env.action_space

    if isinstance(action_space, spaces.Box):
        model = acer.ACERSDNSeparateModel(
            pi=policies.FCGaussianPolicy(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels,
                n_hidden_layers=args.n_hidden_layers,
                bound_mean=True,
                min_action=action_space.low,
                max_action=action_space.high),
            v=v_functions.FCVFunction(obs_space.low.size,
                                      n_hidden_channels=args.n_hidden_channels,
                                      n_hidden_layers=args.n_hidden_layers),
            adv=q_functions.FCSAQFunction(
                obs_space.low.size,
                action_space.low.size,
                n_hidden_channels=args.n_hidden_channels // 4,
                n_hidden_layers=args.n_hidden_layers),
        )
    else:
        model = acer.ACERSeparateModel(
            pi=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), SoftmaxDistribution),
            q=links.Sequence(
                L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu,
                L.Linear(args.n_hidden_channels,
                         action_space.n,
                         initialW=LeCunNormal(1e-3)), DiscreteActionValue),
        )

    opt = rmsprop_async.RMSpropAsync(lr=args.lr,
                                     eps=args.rmsprop_epsilon,
                                     alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))

    replay_buffer = EpisodicReplayBuffer(args.replay_capacity)
    agent = acer.ACER(model,
                      opt,
                      t_max=args.t_max,
                      gamma=0.99,
                      replay_buffer=replay_buffer,
                      n_times_replay=args.n_times_replay,
                      replay_start_size=args.replay_start_size,
                      disable_online_update=args.disable_online_update,
                      use_trust_region=True,
                      trust_region_delta=args.trust_region_delta,
                      truncation_threshold=args.truncation_threshold,
                      beta=args.beta,
                      phi=phi)
    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(0, True)
        eval_stats = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_async(agent=agent,
                                      outdir=args.outdir,
                                      processes=args.processes,
                                      make_env=make_env,
                                      profile=args.profile,
                                      steps=args.steps,
                                      eval_n_runs=args.eval_n_runs,
                                      eval_interval=args.eval_interval,
                                      max_episode_len=timestep_limit)