def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=dict(),
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
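# Note: the snippets in this section assume the Mushroom RL library and NumPy.
# The import paths below are a sketch of what the LSPI test above relies on;
# the exact module layout is an assumption and may differ between releases
# (e.g. the package was later renamed from `mushroom` to `mushroom_rl`):
#
#     import numpy as np
#     from mushroom.algorithms.value import LSPI
#     from mushroom.core import Core
#     from mushroom.environments import CartPole
#     from mushroom.features import Features
#     from mushroom.features.basis import PolynomialBasis
#     from mushroom.policy import EpsGreedy
#     from mushroom.utils.parameters import Parameter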
def build_low_level_agent(alg, params, mdp):
    # Degree-1 polynomial feature on the first state dimension
    features = Features(
        basis_list=[PolynomialBasis(dimensions=[0], degrees=[1])])

    # Deterministic control policy whose weights are searched by a
    # distribution-based (black-box) algorithm
    pi = DeterministicControlPolicy(weights=np.array([0]))
    mu = np.zeros(pi.weights_size)
    sigma = 1e-3 * np.ones(pi.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # MDP info seen by the low-level agent
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi, (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent = alg(distribution, pi, mdp_info_agent2, features=features, **params)

    return agent
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi, mdp.info, fit_params=fit_params,
                 approximator_params=approximator_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
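# A minimal, hypothetical driver for the LSPI cart-pole experiment above; it is
# not part of the original script. It assumes `experiment` and NumPy are in
# scope and averages the mean episode length over a few runs (the run count is
# an illustrative choice).
if __name__ == '__main__':
    n_runs = 3
    lengths = [experiment() for _ in range(n_runs)]
    print('Mean episode length over runs: ', np.mean(lengths))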
def experiment(n_epochs, ep_per_epoch_train, ep_per_epoch_eval, n_iterations):
    np.random.seed()

    # MDP
    mdp = PreyPredator()

    # Features
    basis = PolynomialBasis.generate(1, mdp.info.observation_space.shape[0])
    phi = Features(basis_list=basis[1:])

    approximator = Regressor(LinearApproximator, input_shape=(phi.size,),
                             output_shape=mdp.info.action_space.shape)

    sigma = 1e-2 * np.eye(mdp.info.action_space.shape[0])
    policy = GaussianPolicy(approximator, sigma)

    lr = Parameter(1e-5)
    #agent = GPOMDP(policy, mdp.info, lr, phi)
    agent = KeyboardAgent()

    # Train
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    print('Reward at start: ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=ep_per_epoch_train,
                   n_episodes_per_fit=ep_per_epoch_train // n_iterations,
                   render=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)

        p = policy.get_weights()

        print('mu: ', p)
        print('Reward at iteration ', i, ': ', np.mean(J))

    print('Press a button to visualize the prey-predator environment...')
    input()
    core.evaluate(n_episodes=3, render=True)
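# Hypothetical entry point for the prey-predator experiment above; the epoch
# and episode counts are illustrative assumptions, not values taken from the
# original script.
if __name__ == '__main__':
    experiment(n_epochs=10, ep_per_epoch_train=100, ep_per_epoch_eval=5,
               n_iterations=10)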
def build_mid_level_agent(alg, params, mdp, mu, std):
    # Diagonal Gaussian policy whose mean is a linear approximator initialized
    # to the given mu
    mu_approximator = Regressor(LinearApproximator, input_shape=(1,),
                                output_shape=(2,))
    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)
    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    # Features and MDP info seen by the mid-level agent
    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1,)),
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=1, horizon=10)
    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent
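# Sketch of how the two builder functions above might be wired together,
# assuming a black-box optimizer (e.g. PGPE) for the low-level agent and a
# policy-gradient method (e.g. GPOMDP) for the mid-level agent; the algorithm
# choices, learning rates, mu and std below are illustrative assumptions only:
#
#     agent_low = build_low_level_agent(
#         PGPE, dict(learning_rate=AdaptiveParameter(value=1.5)), mdp)
#     agent_mid = build_mid_level_agent(
#         GPOMDP, dict(learning_rate=AdaptiveParameter(value=1e-2)), mdp,
#         mu=0.5, std=0.1)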
def experiment():
    small = True
    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Features H
    lim = 150 if small else 1000
    tilingsH = Tiles.generate(n_tilings=1, n_tiles=[5, 5], low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # Policy H
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # Agent H
    learning_rate = Parameter(value=1)
    mdp_info_agentH = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([lim, lim]), shape=(2,)),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)
    approximator_paramsH = dict(input_shape=(featuresH.size,),
                                output_shape=mdp_info_agentH.action_space.size,
                                n_actions=mdp_info_agentH.action_space.n)
    agentH = TrueOnlineSARSALambda(policy=piH,
                                   mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # Features L
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    input_shape = (featuresL.size,)
    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy 2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent 1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent 2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    # Termination Conditions
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(name='control block 1', agent=agent1,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(name='control block 2', agent=agent2,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level controller state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level rewards
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level rewards
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Mux Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, reward_acc_H]

    #state_ph.add_input(mux_block)
    #reward_ph.add_input(mux_block)
    #lastaction_ph.add_input(mux_block)

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)

    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)

    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)

    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25,))
    act_max_q_val = np.zeros(shape=(25,))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))

    #low_level_dataset1 = dataset_callback1.get()
    #low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)
    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return
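# Hypothetical entry point for the hierarchical ship-steering experiment above;
# not part of the original script.
if __name__ == '__main__':
    experiment()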