示例#1
0
def test_lspi():
    mdp = CartPole()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=dict(),
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    w = agent.approximator.get_weights()
    w_test = np.array([-2.23880597, -2.27427603, -2.25])

    assert np.allclose(w, w_test)
示例#2
0
def build_low_level_agent(alg, params, mdp):
    features = Features(
        basis_list=[PolynomialBasis(dimensions=[0], degrees=[1])])

    pi = DeterministicControlPolicy(weights=np.array([0]))
    mu = np.zeros(pi.weights_size)
    sigma = 1e-3 * np.ones(pi.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent = alg(distribution, pi, mdp_info_agent2, features=features, **params)

    return agent
示例#3
0
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size, ),
                               output_shape=(mdp.info.action_space.n, ),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(pi,
                 mdp.info,
                 fit_params=fit_params,
                 approximator_params=approximator_params,
                 features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
示例#4
0
def experiment(n_epochs, ep_per_epoch_train, ep_per_epoch_eval, n_iterations):
    np.random.seed()

    # MDP
    mdp = PreyPredator()

    basis = PolynomialBasis.generate(1, mdp.info.observation_space.shape[0])
    phi = Features(basis_list=basis[1:])

    # Features
    approximator = Regressor(LinearApproximator,
                             input_shape=(phi.size, ),
                             output_shape=mdp.info.action_space.shape)

    sigma = 1e-2 * np.eye(mdp.info.action_space.shape[0])
    policy = GaussianPolicy(approximator, sigma)

    lr = Parameter(1e-5)
    #agent = GPOMDP(policy, mdp.info, lr, phi)
    agent = KeyboardAgent()

    # Train
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    print('Reward at start: ', np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=ep_per_epoch_train,
                   n_episodes_per_fit=ep_per_epoch_train // n_iterations,
                   render=False)
        dataset = core.evaluate(n_episodes=ep_per_epoch_eval, render=True)
        J = compute_J(dataset, gamma=mdp.info.gamma)

        p = policy.get_weights()

        print('mu:    ', p)
        print('Reward at iteration ', i, ': ', np.mean(J))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
示例#5
0
def build_mid_level_agent(alg, params, mdp, mu, std):
    mu_approximator = Regressor(LinearApproximator,
                                input_shape=(1, ),
                                output_shape=(2, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1, )),
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=1,
                              horizon=10)
    agent = alg(policy=pi,
                mdp_info=mdp_info_agent1,
                features=features,
                **params)

    return agent
示例#6
0
def experiment():

    small = True

    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    #FeaturesH
    lim = 150 if small else 1000

    tilingsH = Tiles.generate(n_tilings=1,
                              n_tiles=[5, 5],
                              low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)

    mdp_info_agentH = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([lim, lim]), shape=(2, )),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)
    approximator_paramsH = dict(input_shape=(featuresH.size, ),
                                output_shape=mdp_info_agentH.action_space.size,
                                n_actions=mdp_info_agentH.action_space.n)

    agentH = TrueOnlineSARSALambda(policy=piH,
                                   mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(name='control block 1',
                                  agent=agent1,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(name='control block 2',
                                  agent=agent2,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6:ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    #Reward Accumulator H:
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, reward_acc_H
    ]

    #state_ph.add_input(mux_block)
    #reward_ph.add_input(mux_block)
    #lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25, ))
    act_max_q_val = np.zeros(shape=(25, ))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))
    #low_level_dataset1 = dataset_callback1.get()
    #low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return