Example #1
def run_trial(planning_horizon):
    # Assumes the module-level constants blocks_world_size and
    # number_of_episodes are defined (as in Example #7).

    blocks_world_builder = BlocksWorldBuilder(blocks_world_size)
    ctrl = SimpleMonteCarloControl()
    planner = Planner(planning_horizon)
    mc = MonteCarlo(blocks_world_builder,
                    planner,
                    control=ctrl,
                    max_episode_length=blocks_world_size * 2,
                    planning_factor=0,
                    plan_on_empty_policy=True,
                    exploring_starts=True,
                    exploring_factor=0)

    mc.learn_policy(number_episodes=number_of_episodes,
                    show_progress_bar=True,
                    evaluate_return_ratio=False)

    data = pd.DataFrame({
        'episode': range(len(mc.returns)),
        #'return_ratio': mc.return_ratios,
        'observed_returns': mc.returns,
        #'optimal_returns': mc.optimal_returns
    })

    return data
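A brief usage sketch for run_trial, assuming the function and the module-level constants blocks_world_size and number_of_episodes (as in Example #7) are in scope; the horizon and trial counts below are illustrative, not taken from the project:

import pandas as pd

# Hypothetical driver loop: run a few trials per planning horizon and
# aggregate the per-episode returns.
frames = []
for horizon in [1, 3, 5]:
    for trial in range(3):
        trial_data = run_trial(horizon)
        trial_data['planning_horizon'] = horizon
        trial_data['trial'] = trial
        frames.append(trial_data)

results = pd.concat(frames, ignore_index=True)

# Mean observed return per episode, grouped by planning horizon.
summary = results.groupby(['planning_horizon', 'episode'])['observed_returns'].mean()
print(summary.head())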
Example #2
    def test_blocksworld_optimal_return(self):

        mdp_builder = BlocksWorldBuilder(blocks_world_size=5)
        mdp = BlocksWorld(state_initial={
            'on(b2,b1)', 'on(b0,b3)', 'on(b4,table)', 'on(b1,table)',
            'on(b3,table)'
        },
                          state_static={
                              'subgoal(b0,table)', 'subgoal(b1,b0)',
                              'subgoal(b2,b1)', 'subgoal(b3,b2)',
                              'subgoal(b4,b3)'
                          })

        planner = PlannerPolicy(planning_horizon=2 * 5 + 1,
                                mdp_builder=mdp_builder)

        self.assertEqual(94,
                         planner.compute_optimal_return_for_state(mdp.state))
Example #3
    def test_blocksworld_1(self):

        mdp_builder = BlocksWorldBuilder(blocks_world_size=2)
        mdp = BlocksWorld(state_initial={'on(b0,table)', 'on(b1,table)'},
                          state_static={'subgoal(b1,b0)'})

        planner = PlannerPolicy(planning_horizon=1, mdp_builder=mdp_builder)

        suggested_action, expected_return = planner.suggest_action_and_return_for_state(
            mdp.state)

        self.assertEqual(suggested_action,
                         planner.suggest_action_for_state(mdp.state))

        mdp.transition(suggested_action)

        self.assertEqual('move(b1,b0)', suggested_action)
        self.assertEqual(mdp.return_history[0], expected_return)
Example #4
import os
import sys

# Make sure the path of the framework is included in the import path
sys.path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from tests import test_policy
from MonteCarlo import MonteCarlo
from mdp import BlocksWorldBuilder
from control import SimpleMonteCarloControl, SgdMonteCarloControl
from planner import Planner

from matplotlib import pyplot as plt

mdp_builder = BlocksWorldBuilder(blocks_world_size=7)
planner = Planner(planning_horizon=5)
ctrl = SimpleMonteCarloControl()
mc = MonteCarlo(mdp_builder,
                planner,
                control=ctrl,
                max_episode_length=14,
                planning_factor=0,
                plan_on_empty_policy=True,
                exploring_starts=True,
                exploring_factor=0.0)
learned_policy = mc.learn_policy(number_episodes=150, show_progress_bar=True)
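The pyplot import above is otherwise unused in this fragment; a minimal plotting sketch, assuming mc.returns holds one observed return per learned episode as in Example #1:

plt.plot(mc.returns)
plt.xlabel('Episode')
plt.ylabel('Observed return')
plt.title('7-block world, planning horizon 5')
plt.show()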
Example #5
                                   default='True',
                                   choices={'True', 'False'})
    parser_frozenLake.set_defaults(mdp='frozenLake',
                                   behavior_policy='planning_epsilon_greedy')

    args = parser.parse_args()

    initial_value_estimate = -1

    gym_active = False
    is_slippery = False
    frozen_lake_active = False
    frozen_lake_level = ""
    gym_env = ""
    if args.mdp == 'blocksworld':
        mdp_builder = BlocksWorldBuilder(args.blocks_world_size)
    elif args.mdp == 'sokoban':
        mdp_builder = SokobanBuilder(args.sokoban_level_name)
    elif args.mdp == 'frozenLake':
        mdp_builder = FrozenLakeBuilder(args.frozen_lake_level,
                                        args.is_cautious == 'True')
        frozen_lake_active = True
        # The CLI passes booleans as the strings 'True'/'False'.
        gym_active = args.gym_environment_active == 'True'
        if gym_active:
            initial_value_estimate = 0
        is_slippery = args.is_slippery == 'True'
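The fragment above starts mid-way through building the command-line parser; a rough sketch of the kind of argparse setup it implies is shown below. Apart from the argument names visible in the fragment, the sub-command structure, defaults, and types are assumptions, not the project's actual CLI:

import argparse

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()

# Assumed sub-command per MDP type, mirroring the args.mdp checks above.
parser_blocksworld = subparsers.add_parser('blocksworld')
parser_blocksworld.add_argument('--blocks_world_size', type=int, default=5)
parser_blocksworld.set_defaults(mdp='blocksworld')

parser_frozenLake = subparsers.add_parser('frozenLake')
parser_frozenLake.add_argument('--frozen_lake_level', default='4x4')
parser_frozenLake.add_argument('--is_cautious', default='True',
                               choices={'True', 'False'})
parser_frozenLake.add_argument('--is_slippery', default='False',
                               choices={'True', 'False'})
parser_frozenLake.add_argument('--gym_environment_active', default='True',
                               choices={'True', 'False'})
parser_frozenLake.set_defaults(mdp='frozenLake',
                               behavior_policy='planning_epsilon_greedy')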
Example #6
def main(argv):

    setups = list()

    mdp_label = f'{blocks_world_size}-Blocks World'
    mdp_builder = BlocksWorldBuilder(blocks_world_size)

    # Simple Monte Carlo method as a baseline.
    setups += [{
        'mdp_label': mdp_label,
        'mdp_builder': mdp_builder,
        'cls': FirstVisitMonteCarloControl,
        'kwargs': {},
        'plan_for_new_states': plan_for_new_states,
        'label': 'First-visit MC'
    } for plan_for_new_states in tf]

    # Q-Learning with immediate (online) update.
    setups += [{
        'mdp_label': mdp_label,
        'mdp_builder': mdp_builder,
        'cls': QLearningControl,
        'kwargs': {'alpha': alpha},
        'plan_for_new_states': plan_for_new_states,
        'label': f'QL with online update, $\\alpha={alpha}$'
    } for alpha, plan_for_new_states in itertools.product(alphas, tf)]

    # Q-Learning with the update applied in reverse order after each episode.
    setups += [{
        'mdp_label': mdp_label,
        'mdp_builder': mdp_builder,
        'cls': QLearningReversedUpdateControl,
        'kwargs': {'alpha': alpha},
        'plan_for_new_states': plan_for_new_states,
        'label': f'QL with reversed update after episode, $\\alpha={alpha}$'
    } for alpha, plan_for_new_states in itertools.product(alphas, tf)]

    db_file, job_id, setup_id = handle_args(argv)

    try:
        setup = setups[setup_id]
    except IndexError:
        print(
            f'Setup id ({setup_id}) out of range. Must be smaller than {len(setups)}.'
        )
        sys.exit(2)

    df = pd.DataFrame()

    target_policy = QTablePolicy()
    behavior_policy = PlanningExploringStartsPolicy(
        PlannerPolicy(planning_horizon, mdp_builder), RandomPolicy(),
        QTablePolicy(), planning_factor, setup['plan_for_new_states'])

    control_class = setup['cls']
    kwargs = {
        **setup['kwargs'], 'target_policy': target_policy,
        'behavior_policy': behavior_policy
    }

    if control_class == FirstVisitMonteCarloControl:
        # This control algorithm is on-policy (behavior policy == target
        # policy), so no separate behavior policy is passed as an argument.
        kwargs.pop('behavior_policy')

    control = control_class(**kwargs)

    for episode in tqdm(list(range(number_of_episodes))):

        mdp = mdp_builder.build_mdp()

        # Make a completely separate copy of the mdp for evaluation
        mdp_test = copy.deepcopy(mdp)
        control.generate_episode_with_target_policy(
            mdp_test, step_limit=max_episode_length)

        control.learn_episode(mdp, step_limit=max_episode_length)

        row = {
            **setup, 'episode': episode,
            'behavior_policy_return': mdp.return_history[0],
            'target_policy_return': mdp_test.return_history[0],
            'job_id': job_id,
            'setup_id': setup_id
        }

        # DataFrame.append was removed in pandas 2.0; use concat instead.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

    df.to_csv(db_file)
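Each job writes one CSV via df.to_csv(db_file); a post-processing sketch for combining those files, assuming they are collected in a results/ directory (the path is an assumption, the column names come from the rows written above):

import glob

import pandas as pd

# Hypothetical aggregation of the per-job result files.
frames = [pd.read_csv(path) for path in glob.glob('results/*.csv')]
results = pd.concat(frames, ignore_index=True)

# Mean target-policy return per episode for each setup label.
summary = (results.groupby(['label', 'episode'])['target_policy_return']
           .mean()
           .reset_index())
print(summary.head())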
Example #7
number_of_trials = 20
number_of_episodes = 2000
blocks_world_size = 7
planning_horizon = 5

step_size_parameters = [1, 0.8, 0.3, 0.03]
""" SETUP EXPERIMENT """

experiments = []

for _ in range(number_of_trials):

    # Control case: Monte Carlo control as in the bachelor's project, without planning.

    blocks_world_builder = BlocksWorldBuilder(blocks_world_size)
    planner = Planner(planning_horizon)
    ctrl = SimpleMonteCarloControl()
    mc = MonteCarlo(blocks_world_builder,
                    planner,
                    control=ctrl,
                    max_episode_length=blocks_world_size * 2,
                    planning_factor=0,
                    plan_on_empty_policy=True,
                    exploring_starts=True,
                    exploring_factor=0)

    experiments.append(('Mean-based', None, mc))

for step_size_parameter in step_size_parameters * number_of_trials: