import pandas as pd

# `blocks_world_size` and `number_of_episodes` are module-level constants
# defined elsewhere in this script.
def run_trial(planning_horizon):
    blocks_world_builder = BlocksWorldBuilder(blocks_world_size)
    ctrl = SimpleMonteCarloControl()
    planner = Planner(planning_horizon)
    mc = MonteCarlo(blocks_world_builder, planner,
                    control=ctrl,
                    max_episode_length=blocks_world_size * 2,
                    planning_factor=0,
                    plan_on_empty_policy=True,
                    exploring_starts=True,
                    exploring_factor=0)
    mc.learn_policy(number_episodes=number_of_episodes,
                    show_progress_bar=True,
                    evaluate_return_ratio=False)
    data = pd.DataFrame({
        'episode': range(len(mc.returns)),
        #'return_ratio': mc.return_ratios,
        'observed_returns': mc.returns,
        #'optimal_returns': mc.optimal_returns
    })
    return data
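# Usage sketch (hypothetical, not part of the original source): run several
# trials and average the observed per-episode returns across them. Assumes
# pandas is imported as `pd` and the module-level constants used by
# run_trial() are defined.
trial_frames = [run_trial(planning_horizon=5) for _ in range(10)]
mean_returns = (pd.concat(trial_frames)
                .groupby('episode')['observed_returns']
                .mean())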
def test_blocksworld_optimal_return(self):
    mdp_builder = BlocksWorldBuilder(blocks_world_size=5)
    mdp = BlocksWorld(state_initial={'on(b2,b1)', 'on(b0,b3)', 'on(b4,table)',
                                     'on(b1,table)', 'on(b3,table)'},
                      state_static={'subgoal(b0,table)', 'subgoal(b1,b0)',
                                    'subgoal(b2,b1)', 'subgoal(b3,b2)',
                                    'subgoal(b4,b3)'})
    planner = PlannerPolicy(planning_horizon=2 * 5 + 1, mdp_builder=mdp_builder)

    self.assertEqual(94, planner.compute_optimal_return_for_state(mdp.state))
def test_blocksworld_1(self):
    mdp_builder = BlocksWorldBuilder(blocks_world_size=2)
    mdp = BlocksWorld(state_initial={'on(b0,table)', 'on(b1,table)'},
                      state_static={'subgoal(b1,b0)'})
    planner = PlannerPolicy(planning_horizon=1, mdp_builder=mdp_builder)

    suggested_action, expected_return = \
        planner.suggest_action_and_return_for_state(mdp.state)
    self.assertEqual(suggested_action,
                     planner.suggest_action_for_state(mdp.state))

    mdp.transition(suggested_action)

    self.assertEqual('move(b1,b0)', suggested_action)
    self.assertEqual(mdp.return_history[0], expected_return)
import os
import sys

# Make sure the path of the framework is included in the import path
sys.path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from tests import test_policy
from MonteCarlo import MonteCarlo
from mdp import BlocksWorldBuilder
from control import SimpleMonteCarloControl, SgdMonteCarloControl
from planner import Planner
from matplotlib import pyplot as plt

mdp_builder = BlocksWorldBuilder(blocks_world_size=7)
planner = Planner(planning_horizon=5)
ctrl = SimpleMonteCarloControl()

mc = MonteCarlo(mdp_builder, planner,
                control=ctrl,
                max_episode_length=14,
                planning_factor=0,
                plan_on_empty_policy=True,
                exploring_starts=True,
                exploring_factor=0.0)

learned_policy = mc.learn_policy(number_episodes=150, show_progress_bar=True)
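# Plotting sketch (hypothetical continuation): matplotlib is imported above
# but no plotting code appears in this fragment. A minimal learning-curve
# plot, assuming `mc.returns` holds one return per episode as it does in
# run_trial() above; that attribute name is taken from that snippet, not
# verified against the MonteCarlo class.
plt.plot(range(len(mc.returns)), mc.returns)
plt.xlabel('Episode')
plt.ylabel('Observed return')
plt.title('7-Blocks World, planning horizon 5')
plt.show()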
                                 default='True', choices={'True', 'False'})
parser_frozenLake.set_defaults(mdp='frozenLake',
                               behavior_policy='planning_epsilon_greedy')

args = parser.parse_args()

initial_value_estimate = -1
gym_active = False
is_slippery = False
frozen_lake_active = False
frozen_lake_level = ""
gym_env = ""

if args.mdp == 'blocksworld':
    mdp_builder = BlocksWorldBuilder(args.blocks_world_size)
elif args.mdp == 'sokoban':
    mdp_builder = SokobanBuilder(args.sokoban_level_name)
elif args.mdp == 'frozenLake':
    mdp_builder = FrozenLakeBuilder(args.frozen_lake_level,
                                    args.is_cautious == 'True')
    frozen_lake_active = True

if args.gym_environment_active == 'True':
    gym_active = True
    initial_value_estimate = 0

is_slippery = args.is_slippery == 'True'
def main(argv):
    setups = list()

    mdp_label = f'{blocks_world_size}-Blocks World'
    mdp_builder = BlocksWorldBuilder(blocks_world_size)

    # Simple Monte Carlo method as baseline
    setups += [{'mdp_label': mdp_label,
                'mdp_builder': mdp_builder,
                'cls': FirstVisitMonteCarloControl,
                'kwargs': {},
                'plan_for_new_states': plan_for_new_states,
                'label': 'First-visit MC'}
               for plan_for_new_states in tf]

    # Q-Learning with immediate (online) update.
    setups += [{'mdp_label': mdp_label,
                'mdp_builder': mdp_builder,
                'cls': QLearningControl,
                'kwargs': {'alpha': alpha},
                'plan_for_new_states': plan_for_new_states,
                'label': f'QL with online update, $\\alpha={alpha}$'}
               for alpha, plan_for_new_states in itertools.product(alphas, tf)]

    # Q-Learning with reversed update may be better.
    setups += [{'mdp_label': mdp_label,
                'mdp_builder': mdp_builder,
                'cls': QLearningReversedUpdateControl,
                'kwargs': {'alpha': alpha},
                'plan_for_new_states': plan_for_new_states,
                'label': f'QL with reversed update after episode, $\\alpha={alpha}$'}
               for alpha, plan_for_new_states in itertools.product(alphas, tf)]

    db_file, job_id, setup_id = handle_args(argv)

    try:
        setup = setups[setup_id]
    except IndexError:
        print(f'Setup id ({setup_id}) out of range. '
              f'Must be smaller than {len(setups)}.')
        sys.exit(2)

    df = pd.DataFrame()

    target_policy = QTablePolicy()
    behavior_policy = PlanningExploringStartsPolicy(
        PlannerPolicy(planning_horizon, mdp_builder),
        RandomPolicy(),
        QTablePolicy(),
        planning_factor,
        setup['plan_for_new_states'])

    control_class = setup['cls']
    kwargs = {**setup['kwargs'],
              'target_policy': target_policy,
              'behavior_policy': behavior_policy}

    if control_class == FirstVisitMonteCarloControl:
        # This control algorithm is on-policy -> behavior policy = target policy,
        # thus the behavior policy does not need to be provided as an argument.
        kwargs.pop('behavior_policy')

    control = control_class(**kwargs)

    for episode in tqdm(list(range(number_of_episodes))):
        mdp = mdp_builder.build_mdp()

        # Make a completely separate copy of the MDP for evaluation
        mdp_test = copy.deepcopy(mdp)

        control.generate_episode_with_target_policy(mdp_test,
                                                    step_limit=max_episode_length)
        control.learn_episode(mdp, step_limit=max_episode_length)

        row = {**setup,
               'episode': episode,
               'behavior_policy_return': mdp.return_history[0],
               'target_policy_return': mdp_test.return_history[0],
               'job_id': job_id,
               'setup_id': setup_id}

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # behavior-preserving replacement for appending a row.
        df = pd.concat([df, pd.Series(row).to_frame().T], ignore_index=True)

    df.to_csv(db_file)
number_of_trials = 20
number_of_episodes = 2000
blocks_world_size = 7
planning_horizon = 5
step_size_parameters = [1, 0.8, 0.3, 0.03]

""" SETUP EXPERIMENT """

experiments = []

for _ in range(number_of_trials):
    # Control case: Monte Carlo control as in the bachelor's project, without planning.
    blocks_world_builder = BlocksWorldBuilder(blocks_world_size)
    planner = Planner(planning_horizon)
    ctrl = SimpleMonteCarloControl()
    mc = MonteCarlo(blocks_world_builder, planner,
                    control=ctrl,
                    max_episode_length=blocks_world_size * 2,
                    planning_factor=0,
                    plan_on_empty_policy=True,
                    exploring_starts=True,
                    exploring_factor=0)
    experiments.append(('Mean-based', None, mc))

for step_size_parameter in step_size_parameters * number_of_trials:
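    # Hypothetical completion (the loop body is cut off in the original
    # fragment): the SGD-based variant presumably mirrors the control case
    # above but uses SgdMonteCarloControl, which is imported alongside
    # SimpleMonteCarloControl elsewhere in this project. Passing the step
    # size as the constructor argument is an assumption, not taken from
    # the source.
    blocks_world_builder = BlocksWorldBuilder(blocks_world_size)
    planner = Planner(planning_horizon)
    ctrl = SgdMonteCarloControl(step_size_parameter)
    mc = MonteCarlo(blocks_world_builder, planner,
                    control=ctrl,
                    max_episode_length=blocks_world_size * 2,
                    planning_factor=0,
                    plan_on_empty_policy=True,
                    exploring_starts=True,
                    exploring_factor=0)
    experiments.append(('SGD-based', step_size_parameter, mc))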