Пример #1
0
 def create_dyna_flow(self, agent, env):
     flow = DynaFlow(
         train_sample_count_func=lambda: get_global_status_collect()
         ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
         config_or_config_dict={
             "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
             "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
             "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
             "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
             "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
             "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
             "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
             "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
             "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
             "WARM_UP_DYNAMICS_SAMPLES": 1
         },
         func_dict={
             'train_algo': {
                 'func': agent.train,
                 'args': list(),
                 'kwargs': dict(state='state_agent_training')
             },
             'train_algo_from_synthesized_data': {
                 'func': agent.train,
                 'args': list(),
                 'kwargs': dict(state='state_agent_training')
             },
             'train_dynamics': {
                 'func': agent.train,
                 'args': list(),
                 'kwargs': dict(state='state_dynamics_training')
             },
             'test_algo': {
                 'func': agent.test,
                 'args': list(),
                 'kwargs': dict(sample_count=10)
             },
             'test_dynamics': {
                 'func': agent.algo.test_dynamics,
                 'args': list(),
                 'kwargs': dict(sample_count=10, env=env)
             },
             'sample_from_real_env': {
                 'func':
                 agent.sample,
                 'args':
                 list(),
                 'kwargs':
                 dict(sample_count=10,
                      env=agent.env,
                      in_which_status='TRAIN',
                      store_flag=True)
             },
             'sample_from_dynamics_env': {
                 'func':
                 agent.sample,
                 'args':
                 list(),
                 'kwargs':
                 dict(sample_count=10,
                      env=agent.algo.dynamics_env,
                      in_which_status='TRAIN',
                      store_flag=True)
             }
         })
     return flow, locals()
Пример #2
0
def pendulum_task_fn():
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              **exp_config['MLPQValueFunction'])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    output_low=env_spec.action_space.low,
                                    output_high=env_spec.action_space.high,
                                    **exp_config['DeterministicMLPPolicy'],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                policy=policy,
                value_func=mlp_q,
                name=name + '_ddpg',
                **exp_config['DDPG'])

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())
    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=AgentActionNoiseWrapper(
                      noise=NormalActionNoise(),
                      noise_weight_scheduler=ConstantSchedule(value=0.3),
                      action_weight_scheduler=ConstantSchedule(value=1.0)),
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training', train_iter=1)
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=50,
                     sample_type='transition',
                     env=agent.algo.dynamics_env,
                     in_which_status='TRAIN',
                     store_flag=False)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()
Пример #3
0
def task_fn():
    env = make('Pendulum-v0')
    name = 'demo_exp'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_q = MLPQValueFunction(env_spec=env_spec,
                              name_scope=name + '_mlp_q',
                              name=name + '_mlp_q',
                              mlp_config=[{
                                  "ACT": "RELU",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "1",
                                  "N_UNITS": 16,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }, {
                                  "ACT": "LINEAR",
                                  "B_INIT_VALUE": 0.0,
                                  "NAME": "OUPTUT",
                                  "N_UNITS": 1,
                                  "TYPE": "DENSE",
                                  "W_NORMAL_STDDEV": 0.03
                              }])
    policy = DeterministicMLPPolicy(env_spec=env_spec,
                                    name_scope=name + '_mlp_policy',
                                    name=name + '_mlp_policy',
                                    mlp_config=[{
                                        "ACT": "RELU",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "1",
                                        "N_UNITS": 16,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }, {
                                        "ACT": "LINEAR",
                                        "B_INIT_VALUE": 0.0,
                                        "NAME": "OUPTUT",
                                        "N_UNITS": env_spec.flat_action_dim,
                                        "TYPE": "DENSE",
                                        "W_NORMAL_STDDEV": 0.03
                                    }],
                                    reuse=False)

    ddpg = DDPG(env_spec=env_spec,
                config_or_config_dict={
                    "REPLAY_BUFFER_SIZE": 10000,
                    "GAMMA": 0.999,
                    "Q_NET_L1_NORM_SCALE": 0.01,
                    "Q_NET_L2_NORM_SCALE": 0.01,
                    "CRITIC_LEARNING_RATE": 0.001,
                    "ACTOR_LEARNING_RATE": 0.001,
                    "DECAY": 0.5,
                    "BATCH_SIZE": 50,
                    "TRAIN_ITERATION": 1,
                    "critic_clip_norm": 0.1,
                    "actor_clip_norm": 0.1,
                },
                value_func=mlp_q,
                policy=policy,
                name=name + '_ddpg',
                replay_buffer=None)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        learning_rate=0.01,
        mlp_config=[{
            "ACT": "RELU",
            "B_INIT_VALUE": 0.0,
            "NAME": "1",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": 16,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }, {
            "ACT": "LINEAR",
            "B_INIT_VALUE": 0.0,
            "NAME": "OUPTUT",
            "L1_NORM": 0.0,
            "L2_NORM": 0.0,
            "N_UNITS": env_spec.flat_obs_dim,
            "TYPE": "DENSE",
            "W_NORMAL_STDDEV": 0.03
        }])
    algo = Dyna(env_spec=env_spec,
                name=name + '_dyna_algo',
                model_free_algo=ddpg,
                dynamics_model=mlp_dyna,
                config_or_config_dict=dict(dynamics_model_train_iter=10,
                                           model_free_algo_train_iter=10))
    # For examples only, we use random reward function and terminal function with fixed episode length.
    algo.set_terminal_reward_function_for_dynamics_env(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=algo.dynamics_env.total_step_count_fn),
        reward_func=RandomRewardFunc())
    agent = Agent(
        env=env,
        env_spec=env_spec,
        algo=algo,
        algo_saving_scheduler=PeriodicalEventSchedule(
            t_fn=lambda: get_global_status_collect()
            ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
            trigger_every_step=20,
            after_t=10),
        name=name + '_agent',
        exploration_strategy=EpsilonGreedy(action_space=env_spec.action_space,
                                           init_random_prob=0.5))
    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict={
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_REAL_ENV": 10,
            "TRAIN_ALGO_EVERY_REAL_SAMPLE_COUNT_FROM_DYNAMICS_ENV": 10,
            "TEST_ALGO_EVERY_REAL_SAMPLE_COUNT": 10,
            "TEST_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "TRAIN_DYNAMICS_EVERY_REAL_SAMPLE_COUNT": 10,
            "START_TRAIN_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TRAIN_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_ALGO_AFTER_SAMPLE_COUNT": 1,
            "START_TEST_DYNAMICS_AFTER_SAMPLE_COUNT": 1,
            "WARM_UP_DYNAMICS_SAMPLES": 1
        },
        func_dict={
            'train_algo': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_algo_from_synthesized_data': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_agent_training')
            },
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=10)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=10, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.algo.dynamics_env,
                     in_which_status='TRAIN',
                     store_flag=True)
            }
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name + '_exp')
    experiment.run()
Пример #4
0
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        **exp_config['DynamicsModel']
    )
    algo = ModelPredictiveControl(
        dynamics_model=mlp_dyna,
        env_spec=env_spec,
        config_or_config_dict=exp_config['MPC'],
        name=name + '_mpc',
        policy=UniformRandomPolicy(env_spec=env_spec, name='uni_policy')
    )
    algo.set_terminal_reward_function_for_dynamics_env(reward_func=REWARD_FUNC_DICT['Pendulum-v0'](),
                                                       terminal_func=FixedEpisodeLengthTerminalFunc(
                                                           max_step_length=env.unwrapped._max_episode_steps,
                                                           step_count_fn=algo.dynamics_env.total_step_count_fn), )
    agent = Agent(env=env, env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {'func': agent.train,
                               'args': list(),
                               'kwargs': dict()},
            'train_algo': None,
            'test_algo': {'func': agent.test,
                          'args': list(),
                          'kwargs': dict(sample_count=1, sample_trajectory_flag=True)},
            'test_dynamics': {'func': agent.algo.test_dynamics,
                              'args': list(),
                              'kwargs': dict(sample_count=100, env=env)},
            'sample_from_real_env': {'func': agent.sample,
                                     'args': list(),
                                     'kwargs': dict(sample_count=10,
                                                    env=agent.env,
                                                    in_which_status='TRAIN',
                                                    store_flag=True)},
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        }
    )

    experiment = Experiment(
        tuner=None,
        env=env,
        agent=agent,
        flow=flow,
        name=name
    )
    experiment.run()
Пример #5
0
def pendulum_task_fn():
    exp_config = PENDULUM_BENCHMARK_CONFIG_DICT
    GlobalConfig().set('DEFAULT_EXPERIMENT_END_POINT',
                       exp_config['DEFAULT_EXPERIMENT_END_POINT'])

    env = make('Pendulum-v0')
    name = 'benchmark'
    env_spec = EnvSpec(obs_space=env.observation_space,
                       action_space=env.action_space)

    mlp_dyna = ContinuousMLPGlobalDynamicsModel(
        env_spec=env_spec,
        name_scope=name + '_mlp_dyna',
        name=name + '_mlp_dyna',
        output_low=env_spec.obs_space.low,
        output_high=env_spec.obs_space.high,
        **exp_config['DynamicsModel'])
    dyna_env = DynamicsEnvWrapper(mlp_dyna)
    dyna_env.set_terminal_reward_func(
        terminal_func=FixedEpisodeLengthTerminalFunc(
            max_step_length=env.unwrapped._max_episode_steps,
            step_count_fn=dyna_env.total_step_count_fn),
        reward_func=REWARD_FUNC_DICT['Pendulum-v0']())

    policy = iLQRPolicy(env_spec=env_spec,
                        **exp_config['ILQR'],
                        dynamics=dyna_env,
                        cost_fn=RewardFuncCostWrapper(
                            reward_func=REWARD_FUNC_DICT['Pendulum-v0']()))

    algo = iLQRAlogWrapper(policy=policy,
                           env_spec=env_spec,
                           dynamics_env=dyna_env)

    agent = Agent(env=env,
                  env_spec=env_spec,
                  algo=algo,
                  exploration_strategy=None,
                  noise_adder=None,
                  name=name + '_agent')

    flow = DynaFlow(
        train_sample_count_func=lambda: get_global_status_collect()
        ('TOTAL_AGENT_TRAIN_SAMPLE_COUNT'),
        config_or_config_dict=exp_config['DynaFlow'],
        func_dict={
            'train_dynamics': {
                'func': agent.train,
                'args': list(),
                'kwargs': dict(state='state_dynamics_training')
            },
            'train_algo': None,
            'test_algo': {
                'func': agent.test,
                'args': list(),
                'kwargs': dict(sample_count=1, sample_trajectory_flag=True)
            },
            'test_dynamics': {
                'func': agent.algo.test_dynamics,
                'args': list(),
                'kwargs': dict(sample_count=100, env=env)
            },
            'sample_from_real_env': {
                'func':
                agent.sample,
                'args':
                list(),
                'kwargs':
                dict(sample_count=10,
                     env=agent.env,
                     in_which_status='TRAIN',
                     store_flag=True)
            },
            'sample_from_dynamics_env': None,
            'train_algo_from_synthesized_data': None
        })

    experiment = Experiment(tuner=None,
                            env=env,
                            agent=agent,
                            flow=flow,
                            name=name)
    experiment.run()