def run(agent: DDPG, steps: int, save_dir: str) -> float:
    sim_distance_file = DataCSVSaver(os.path.join(save_dir, "distance.txt"), ("step", "distance"))
    sim_somite_phase_diffs_file = DataCSVSaver(os.path.join(save_dir, "somite_phase_diffs.txt"), ["step"] + ["phi_{}".format(i) for i in range(config.oscillators)])
    sim_gripper_phase_diffs_file = DataCSVSaver(os.path.join(save_dir, "gripper_phase_diffs.txt"), ["step"] + ["phi_{}".format(i) for i in range(config.grippers)])
    sim_somite_phases_file = DataCSVSaver(os.path.join(save_dir, "somite_phases.txt"), ["step"] + ["phi_{}".format(i) for i in range(config.oscillators)])
    sim_gripper_phases_file = DataCSVSaver(os.path.join(save_dir, "gripper_phases.txt"), ["step"] + ["phi_{}".format(i) for i in range(config.grippers)])
    sim_somite_actions_file = DataCSVSaver(os.path.join(save_dir, "somite_actions.txt"), ["step"] + ["action_{}".format(i) for i in range(config.oscillators)])
    sim_gripper_actions_file = DataCSVSaver(os.path.join(save_dir, "gripper_actions.txt"), ["step"] + ["action_{}".format(i) for i in range(config.grippers)])
    sim_frictions_file = DataCSVSaver(os.path.join(save_dir, "frictions.txt"), ["step"] + ["friction_{}".format(i) for i in range(config.somites)])
    sim_tension_file = DataCSVSaver(os.path.join(save_dir, "tensions.txt"), ["step"] + ["tension_{}".format(i) for i in range(config.somites - 2)])

    caterpillar = Caterpillar(config.somites, config.oscillators_list, config.grippers_list, config.caterpillar_params)
    locomotion_distance = utils.locomotion_distance_logger(caterpillar)
    for step in range(steps):
        obs, somite_phases, gripper_phases = observe(caterpillar)
        actions = agent.act(obs)
        feedbacks_somite, feedbacks_gripper = actions[:config.oscillators], actions[config.oscillators:]
        caterpillar.step_with_feedbacks(config.params["time_delta"], tuple(feedbacks_somite), tuple(feedbacks_gripper))

        frictions, tensions, _, _ = np.split(
            obs, [config.somites, config.somites * 2 - 2, config.somites * 2 - 2 + config.oscillators])
        sim_distance_file.append_data(step, locomotion_distance(caterpillar))
        sim_somite_phase_diffs_file.append_data(step, *utils.phase_diffs(np.array(somite_phases)))
        sim_gripper_phase_diffs_file.append_data(step, *utils.phase_diffs(np.array(gripper_phases)))
        sim_somite_phases_file.append_data(step, *utils.mod2pi(np.array(somite_phases)))
        sim_gripper_phases_file.append_data(step, *utils.mod2pi(np.array(gripper_phases)))
        sim_frictions_file.append_data(step, *frictions)
        sim_tension_file.append_data(step, *tensions)
        sim_somite_actions_file.append_data(step, *feedbacks_somite)
        sim_gripper_actions_file.append_data(step, *feedbacks_gripper)

    caterpillar.save_simulation("{}/render.json".format(save_dir))
    return locomotion_distance(caterpillar)
Example #2
    def make_ddpg_agent(self, env, model, actor_opt, critic_opt, explorer,
                        rbuf, gpu):
        return DDPG(model,
                    actor_opt,
                    critic_opt,
                    rbuf,
                    gpu=gpu,
                    gamma=0.9,
                    explorer=explorer,
                    replay_start_size=100,
                    target_update_method='soft',
                    target_update_interval=1,
                    episodic_update=False)
def build_agent() -> DDPG:
    # observation:
    # friction on each somite (#somite)
    # tension on each somite except for both ends (#somite - 2)
    # cos(somite phases), sin(somites phases) (#oscillator x 2)
    # cos(gripper phases), sin(gripper phases) (#gripper x 2)
    obs_size = config.somites + (config.somites - 2) + config.oscillators*2 + config.grippers*2
    # actions: feedbacks to somite oscillators, feedbacks to gripper oscillators
    action_size = config.oscillators + config.grippers

    q_func = q_functions.FCBNLateActionSAQFunction(
        obs_size,
        action_size,
        n_hidden_channels=6,
        n_hidden_layers=2,
        normalize_input=True)
    pi = policy.FCBNDeterministicPolicy(
        obs_size,
        action_size=action_size,
        n_hidden_channels=6,
        n_hidden_layers=2,
        min_action=-F_OUTPUT_BOUND,
        max_action=F_OUTPUT_BOUND,
        bound_action=True,
        normalize_input=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_actor = optimizers.Adam()
    opt_actor.setup(model['policy'])
    opt_critic = optimizers.Adam()
    opt_critic.setup(model['q_function'])
    opt_actor.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_actor')
    opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_critic')

    rep_buf = replay_buffer.ReplayBuffer(capacity=1 * 10 ** 5)
    explorer = explorers.AdditiveOU(sigma=OU_SIGMA)
    phi = lambda x: x.astype(np.float32)

    agent = DDPG(model, opt_actor, opt_critic, rep_buf, gamma=GAMMA, explorer=explorer,
                 phi=phi, gpu=GPU, replay_start_size=10000)
    return agent
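The run() and build_agent() snippets above come from the same caterpillar-locomotion project, so they compose directly. A minimal usage sketch, assuming the project's config module is importable and the output directory exists (the step count and path are placeholder values, not from the original code):

agent = build_agent()
distance = run(agent, steps=10000, save_dir="results/ddpg_run")  # placeholder arguments
print("locomotion distance:", distance)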
Example #4
opt_actor.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

rbuf = replay_buffer.ReplayBuffer(replay_buffer_size)
ou_sigma = (action_space.high - action_space.low) * 0.2

explorer = explorers.AdditiveOU(sigma=ou_sigma)

# The agent
agent = DDPG(model,
             opt_actor,
             opt_critic,
             rbuf,
             gamma=gamma,
             explorer=explorer,
             replay_start_size=replay_start_size,
             target_update_method=target_update_method,
             target_update_interval=target_update_interval,
             update_interval=update_interval,
             soft_update_tau=soft_update_tau,
             n_times_update=number_of_update_times,
             phi=phi,
             minibatch_size=minibatch_size)


class ExpertDDPGAgent:
    def __init__(self, path):
        self.path = path

    def load_agent(self):
        agent.load(self.path)
        return agent
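Note that load_agent() restores saved parameters into the module-level agent constructed just above, rather than building a new one. A minimal, hedged usage sketch (the path is a placeholder for a directory previously produced by agent.save()):

expert = ExpertDDPGAgent("path/to/saved_ddpg_agent")  # placeholder path
pretrained_agent = expert.load_agent()
# pretrained_agent.act(obs) can then be used for greedy evaluation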
Example #5
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='Humanoid-v2')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-interval', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-interval', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-interval', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = chainerrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = chainerrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
Example #6
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='out')
    parser.add_argument('--env', type=str, default='Humanoid-v1')
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps', type=int, default=10**6)
    parser.add_argument('--actor-lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=10**7)
    parser.add_argument('--n-hidden-channels', type=int, default=300)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=5000)
    parser.add_argument('--n-update-times', type=int, default=1)
    parser.add_argument('--target-update-frequency', type=int, default=1)
    parser.add_argument('--target-update-method',
                        type=str,
                        default='soft',
                        choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1e-2)
    parser.add_argument('--update-frequency', type=int, default=4)
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--eval-frequency', type=int, default=10**5)
    parser.add_argument('--gamma', type=float, default=0.995)
    parser.add_argument('--minibatch-size', type=int, default=200)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--use-bn', action='store_true', default=False)
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--reward-scale-factor', type=float, default=1e-2)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    if args.seed is not None:
        misc.set_random_seed(args.seed)

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def reward_filter(r):
        return r * args.reward_scale_factor

    def make_env():
        env = gym.make(args.env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        misc.env_modifiers.make_reward_filtered(env, reward_filter)
        if args.render:
            misc.env_modifiers.make_rendered(env)

        def __exit__(self, *args):
            pass

        env.__exit__ = __exit__
        return env

    env = make_env()
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    if args.use_bn:
        q_func = q_functions.FCBNLateActionSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            normalize_input=True)
        pi = policy.FCBNDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True,
            normalize_input=True)
    else:
        q_func = q_functions.FCSAQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers)
        pi = policy.FCDeterministicPolicy(
            obs_size,
            action_size=action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            min_action=action_space.low,
            max_action=action_space.high,
            bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10**5)

    def phi(obs):
        return obs.astype(np.float32)

    def random_action():
        a = action_space.sample()
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    agent = DDPG(model,
                 opt_a,
                 opt_c,
                 rbuf,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 target_update_method=args.target_update_method,
                 target_update_frequency=args.target_update_frequency,
                 update_frequency=args.update_frequency,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 phi=phi,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size)
    agent.logger.setLevel(logging.DEBUG)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        mean, median, stdev = experiments.eval_performance(
            env=env,
            agent=agent,
            n_runs=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, mean, median, stdev))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_frequency=args.eval_frequency,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
Example #7
opt_critic.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')
rbuf = replay_buffer.ReplayBuffer(5 * 10**5)
ou_sigma = (action_space.high - action_space.low) * 0.2

#Explorer
explorer = explorers.AdditiveOU(sigma=ou_sigma)

#Agent
agent = DDPG(
    model,
    opt_actor,
    opt_critic,
    rbuf,
    gamma=args.gamma,
    explorer=explorer,
    replay_start_size=5000,
    target_update_method='soft',
    target_update_interval=1,
    update_interval=4,
    soft_update_tau=1e-2,
    n_times_update=1,
    phi=phi,
    minibatch_size=128  #args.minibatch_size
)

# Load the saved model
agent.load("DDPG_last_model_rew8k")

for i in range(args.test_episodes):
    obs = env.reset()
    done = False
    reward_Alan = 0
actor_optimizer = chainer.optimizers.Adam(eps=1e-2)
actor_optimizer.setup(model['policy'])
actor_optimizer.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
critic_optimizer = chainer.optimizers.Adam(eps=1e-2)
critic_optimizer.setup(model['q_function'])
critic_optimizer.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')
replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)
gamma = 0.99
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.action_space.sample)
phi = lambda x: x.astype(np.float32, copy=False)
agent = DDPG(model,
             actor_optimizer,
             critic_optimizer,
             replay_buffer,
             gamma,
             explorer=explorer,
             replay_start_size=500,
             update_interval=1,
             target_update_interval=100,
             phi=phi)

# Number of episodes (trials)
n_episodes = 2000
# Maximum number of time steps per episode
max_episode_len = 200

# ==== Training phase ====
for i in range(1, n_episodes + 1):
    env.reset()
    obs, reward, done, _ = env.step(env.action_space.sample())
    R = 0  # return (sum of rewards)
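    # --- Continuation sketch (not part of the original snippet) ---
    # A minimal guess at how this ChainerRL training loop typically proceeds,
    # mirroring the act_and_train / stop_episode_and_train pattern in Example #13 below.
    t = 0
    while not done and t < max_episode_len:
        action = agent.act_and_train(obs, reward)    # choose an action and store the transition
        obs, reward, done, _ = env.step(action)
        R += reward
        t += 1
    agent.stop_episode_and_train(obs, reward, done)  # close out the episode
    if i % 100 == 0:
        print('episode:', i, 'return:', R, 'statistics:', agent.get_statistics())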
Example #9
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--env',
                        type=str,
                        default='Hopper-v2',
                        help='OpenAI Gym MuJoCo env to perform algorithm on.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--load',
                        type=str,
                        default='',
                        help='Directory to load agent from.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument('--eval-n-runs',
                        type=int,
                        default=10,
                        help='Number of episodes run for each evaluation.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=5000,
                        help='Interval in timesteps between evaluations.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=10000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=100,
                        help='Minibatch size')
    parser.add_argument('--render',
                        action='store_true',
                        help='Render env states in a GUI window.')
    parser.add_argument('--demo',
                        action='store_true',
                        help='Just run evaluation, not training.')
    parser.add_argument('--monitor',
                        action='store_true',
                        help='Wrap env with gym.wrappers.Monitor.')
    parser.add_argument('--logger-level',
                        type=int,
                        default=logging.INFO,
                        help='Level of the root logger.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    def make_env(test):
        env = gym.make(args.env)
        # Unwrap the TimeLimit wrapper
        assert isinstance(env, gym.wrappers.TimeLimit)
        env = env.env
        # Use different random seeds for train and test envs
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    action_size = action_space.low.size

    winit = chainer.initializers.LeCunUniform(3**-0.5)
    q_func = chainer.Sequential(
        concat_obs_and_action,
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, 1, initialW=winit),
    )
    policy = chainer.Sequential(
        L.Linear(None, 400, initialW=winit),
        F.relu,
        L.Linear(None, 300, initialW=winit),
        F.relu,
        L.Linear(None, action_size, initialW=winit),
        F.tanh,
        chainerrl.distribution.ContinuousDeterministicDistribution,
    )
    model = DDPGModel(q_func=q_func, policy=policy)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(model.xp.zeros_like(obs_space.low,
                                                    dtype=np.float32)[None],
                                name='observation')
    fake_action = chainer.Variable(model.xp.zeros_like(action_space.low,
                                                       dtype=np.float32)[None],
                                   name='action')
    chainerrl.misc.draw_computational_graph([policy(fake_obs)],
                                            os.path.join(
                                                args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph([q_func(fake_obs, fake_action)],
                                            os.path.join(
                                                args.outdir, 'q_func'))

    opt_a = optimizers.Adam()
    opt_c = optimizers.Adam()
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])

    rbuf = replay_buffer.ReplayBuffer(10**6)

    explorer = explorers.AdditiveGaussian(scale=0.1,
                                          low=action_space.low,
                                          high=action_space.high)

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = DDPG(
        model,
        opt_a,
        opt_c,
        rbuf,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_method='soft',
        target_update_interval=1,
        update_interval=1,
        soft_update_tau=5e-3,
        n_times_update=1,
        gpu=args.gpu,
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
    )

    if len(args.load) > 0:
        agent.load(args.load)

    eval_env = make_env(test=True)
    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_env=eval_env,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            train_max_episode_len=timestep_limit)
Example #10
ou_sigma = (env.action_space.high - env.action_space.low) * 0.2
#explorer = explorers.AdditiveOU(sigma=ou_sigma)
explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.2, random_action_func=env.action_space.sample)

phi = lambda x: np.array(x).astype(np.float32, copy=False)  # convert the observation dtype to float32

# Agent
agent = DDPG(model,
             opt_actor,
             opt_critic,
             rbuf,
             gamma=args.gamma,
             explorer=explorer,
             replay_start_size=5000,
             target_update_method='soft',
             target_update_interval=1,
             update_interval=4,
             soft_update_tau=1e-2,
             n_times_update=1,
             phi=phi,
             minibatch_size=128)


def graph_reward(reward, saveas, epochs):
    name = saveas + '.png'
    episodes = np.linspace(0, epochs, epochs)
    plt.figure()
    plt.plot(episodes, reward, 'cadetblue', label='DDPG')
    plt.legend()
    plt.xlabel("Episode")
Example #11
def main():
    import logging
    logging.basicConfig(level=logging.DEBUG)

    parser = argparse.ArgumentParser()
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--env', type=str, default='FetchPickAndPlace-v1')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--final-exploration-steps',
                        type=int, default=10 ** 6)
    parser.add_argument('--actor-lr', type=float, default=1e-3)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--steps', type=int, default=200 * 50 * 16 * 50)
    parser.add_argument('--n-hidden-channels', type=int, default=64)
    parser.add_argument('--n-hidden-layers', type=int, default=3)
    parser.add_argument('--replay-start-size', type=int, default=10000)
    parser.add_argument('--n-update-times', type=int, default=40)
    parser.add_argument('--target-update-interval',
                        type=int, default=16 * 50)
    parser.add_argument('--target-update-method',
                        type=str, default='soft', choices=['hard', 'soft'])
    parser.add_argument('--soft-update-tau', type=float, default=1 - 0.95)
    parser.add_argument('--update-interval', type=int, default=16 * 50)
    parser.add_argument('--eval-n-runs', type=int, default=30)
    parser.add_argument('--eval-interval', type=int, default=50 * 16 * 50)
    parser.add_argument('--gamma', type=float, default=0.98)
    parser.add_argument('--minibatch-size', type=int, default=128)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--monitor', action='store_true')
    parser.add_argument('--epsilon', type=float, default=0.05)
    parser.add_argument('--noise-std', type=float, default=0.05)
    parser.add_argument('--clip-threshold', type=float, default=5.0)
    parser.add_argument('--num-envs', type=int, default=1)
    args = parser.parse_args()

    args.outdir = experiments.prepare_output_dir(
        args, args.outdir, argv=sys.argv)
    print('Output files are saved in {}'.format(args.outdir))

    # Set a random seed used in ChainerRL
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(idx, test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        env.seed(env_seed)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            misc.env_modifiers.make_action_filtered(env, clip_action_filter)
        if args.render and not test:
            env = chainerrl.wrappers.Render(env)
        if test:
            env = HEREnvWrapper(env, args.outdir)
        return env

    def make_batch_env(test):
        return chainerrl.envs.MultiprocessVectorEnv(
            [(lambda: make_env(idx, test))
             for idx, env in enumerate(range(args.num_envs))])

    sample_env = make_env(0, test=False)

    def reward_function(state, action, goal):
        return sample_env.compute_reward(achieved_goal=state['achieved_goal'],
                                         desired_goal=goal,
                                         info=None)

    timestep_limit = sample_env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    space_dict = sample_env.observation_space.spaces
    observation_space = space_dict['observation']
    goal_space = space_dict['desired_goal']
    obs_size = np.asarray(observation_space.shape).prod()
    goal_size = np.asarray(goal_space.shape).prod()
    action_space = sample_env.action_space

    action_size = np.asarray(action_space.shape).prod()    
    q_func = q_functions.FCSAQFunction(
        obs_size + goal_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = policy.FCDeterministicPolicy(
        obs_size + goal_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.HindsightReplayBuffer(reward_function,
                                               10 ** 6,
                                               future_k=4)

    def phi(dict_state):
        return np.concatenate(
            (dict_state['observation'].astype(np.float32, copy=False),
             dict_state['desired_goal'].astype(np.float32, copy=False)), 0)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_size + goal_size, clip_threshold=args.clip_threshold)

    explorer = HERExplorer(args.noise_std,
                           args.epsilon,
                           action_space)
    agent = DDPG(model, opt_a, opt_c, rbuf,
                 obs_normalizer=obs_normalizer,
                 gamma=args.gamma,
                 explorer=explorer,
                 replay_start_size=args.replay_start_size,
                 phi=phi,
                 target_update_method=args.target_update_method,
                 target_update_interval=args.target_update_interval,
                 update_interval=args.update_interval,
                 soft_update_tau=args.soft_update_tau,
                 n_times_update=args.n_update_times,
                 gpu=args.gpu,
                 minibatch_size=args.minibatch_size,
                 clip_critic_tgt=(-1.0/(1.0-args.gamma), 0.0))

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent, env=make_batch_env(test=False), steps=args.steps,
            eval_env=make_batch_env(test=True), eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs, eval_interval=args.eval_interval,
            outdir=args.outdir,
            max_episode_len=timestep_limit)
Example #12
def make_agent_ddpg(args, env):
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    q_func = FCSAQFunction(
        obs_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = FCDeterministicPolicy(
        obs_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    if args.gpu > -1:
        q_func.to_gpu(args.gpu)
        pi.to_gpu(args.gpu)
    else:
        q_func.to_cpu()
        pi.to_cpu()
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def phi(obs):
        return obs.astype(np.float32)

    # def random_action():
    #    a = action_space.sample()
    #    if isinstance(a, np.ndarray):
    #        a = a.astype(np.float32)
    #    return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    if args.skip_step == 0:
        agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                     explorer=explorer, replay_start_size=args.replay_start_size,
                     target_update_method=args.target_update_method,
                     target_update_interval=args.target_update_interval,
                     update_interval=args.update_interval,
                     soft_update_tau=args.soft_update_tau,
                     n_times_update=args.n_update_times,
                     phi=phi, gpu=args.gpu, minibatch_size=args.minibatch_size)
    else:
        agent = DDPGStep(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                         explorer=explorer, replay_start_size=args.replay_start_size,
                         target_update_method=args.target_update_method,
                         target_update_interval=args.target_update_interval,
                         update_interval=args.update_interval,
                         soft_update_tau=args.soft_update_tau,
                         n_times_update=args.n_update_times,
                         phi=phi, gpu=args.gpu, minibatch_size=args.minibatch_size, skip_step=args.skip_step)
        if args.model_dir is not None:
            agent.save(args.model_dir)
    return agent
Example #13
def main():

    # Reinforcement learning hyperparameters
    gamma = 0.995
    num_episodes = 100  # total number of episodes

    # DDPG setup
    q_func = QFunction()  # Q-function
    policy = PolicyNetwork()  # policy network
    model = DDPGModel(q_func=q_func, policy=policy)
    optimizer_p = chainer.optimizers.Adam(alpha=1e-4)
    optimizer_q = chainer.optimizers.Adam(alpha=1e-3)
    optimizer_p.setup(model['policy'])
    optimizer_q.setup(model['q_function'])

    explorer = chainerrl.explorers.AdditiveOU(sigma=1.0)  # sigma sets the strength of the added exploration noise
    replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)
    phi = lambda x: x.astype(np.float32, copy=False)

    agent = DDPG(model,
                 optimizer_p,
                 optimizer_q,
                 replay_buffer,
                 gamma=gamma,
                 explorer=explorer,
                 replay_start_size=1000,
                 target_update_method='soft',
                 target_update_interval=1,
                 update_interval=4,
                 soft_update_tau=0.01,
                 n_times_update=1,
                 phi=phi,
                 gpu=-1,
                 minibatch_size=200)

    def reward_filter(r):  # shrink rewards so they fall roughly in the 0-1 range
        return r * 0.01

    outdir = 'result'
    chainerrl.misc.set_random_seed(0)
    env = gym.make('SpaceInvaders-v0')  # create the Space Invaders environment
    env.seed(0)
    chainerrl.misc.env_modifiers.make_reward_filtered(env, reward_filter)
    env = gym.wrappers.Monitor(env, outdir)  # record videos

    # Run episodes and start training
    for episode in range(1, num_episodes + 1):  # repeat for the configured number of episodes
        done = False
        reward = 0
        n_steps = 0
        total_reward = 0
        obs = env.reset()
        obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
        while not done:
            action = agent.act_and_train(obs, reward)  # action is continuous-valued
            action = F.argmax(action).data  # select the action with the largest output
            obs, reward, done, info = env.step(action)  # execute the action
            total_reward += reward
            n_steps += 1
            obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
            print('{0:4d}: action {1}, reward {2}, done? {3}, {4}'.format(
                n_steps, action, reward, done, info))
        agent.stop_episode_and_train(obs, reward, done)
        print('Episode {0:4d}: total reward {1}, n_steps {2}, statistics: {3}'.
              format(episode, total_reward, n_steps, agent.get_statistics()))
        if episode % 10 == 0:
            agent.save('agent_DDPG_spaceinvaders_' + str(episode))
Example #14
rbuf = replay_buffer.ReplayBuffer(args.replay_buffer_size)
ou_sigma = (action_space.high - action_space.low) * 0.2

#Possible explorers
explorer = explorers.AdditiveOU(sigma=ou_sigma)
#explorer = chainerrl.explorers.ConstantEpsilonGreedy(epsilon=0.2, random_action_func=env.action_space.sample)

# The agent
agent = DDPG(model,
             opt_actor,
             opt_critic,
             rbuf,
             gamma=args.gamma,
             explorer=explorer,
             replay_start_size=args.replay_start_size,
             target_update_method=args.target_update_method,
             target_update_interval=args.target_update_interval,
             update_interval=args.update_interval,
             soft_update_tau=args.soft_update_tau,
             n_times_update=args.number_of_update_times,
             phi=phi,
             minibatch_size=args.minibatch_size)

G = []
G_mean = []
best_reward = -1000
#chainer.cuda.get_device_from_id(1).use()  # fix this for GPU use
for ep in range(1, args.num_episodes + 1):

    obs = env.reset(project=True)
    state_desc = env.get_state_desc()