Exemplo n.º 1
0
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    """Train a two-level ship-steering controller and save the results.

    A high-level agent (built with ``alg_high``) and a low-level agent
    (built with ``alg_low``) are wired into a computational graph around a
    ``ShipSteering`` model, trained for ``n_runs`` rounds and evaluated
    before training and after every round.

    Args:
        alg_high: Callable building the high-level agent; it is called with
            the keywords ``policy``, ``mdp_info``, ``learning_rate`` and
            ``features``.
        alg_low: Callable building the low-level agent; it is called with
            the keywords ``distribution``, ``policy``, ``mdp_info`` and
            ``learning_rate``.
        params: Object with a ``get`` method, queried for
            ``'learning_rate_high'`` and ``'learning_rate_low'``.
        subdir: Prefix of the output directory.
        i: Run identifier; ``subdir + str(i)`` is the output directory.

    Note:
        ``ep_per_run``, ``eval_run``, ``n_runs`` and ``n_iterations`` are
        read from the enclosing module scope.
    """
    np.random.seed()

    # Environment
    mdp = ShipSteering(small=False, n_steps_action=3)

    # Placeholders exposing state, reward and last action to the graph
    state_in = PlaceHolder(name='state_ph')
    reward_in = PlaceHolder(name='reward_ph')
    last_action_in = PlaceHolder(name='lastaction_ph')

    # Function blocks: f1 feeds the low-level controller, f2 is its reward
    angle_diff_block = fBlock(name='f1 (angle difference)',
                              phi=pos_ref_angle_difference)
    cosine_cost_block = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features used by the high-level approximator
    features = Features(basis_list=[PolynomialBasis()])

    # High-level policy: diagonal Gaussian around a linear approximator
    high_approximator = Regressor(LinearApproximator,
                                  input_shape=(features.size, ),
                                  output_shape=(2, ))
    high_approximator.set_weights(np.array([500, 500]))
    high_policy = DiagonalGaussianPolicy(mu=high_approximator,
                                         std=np.array([255, 255]))

    # Low-level policy and the distribution over its weights
    low_policy = DeterministicControlPolicy(weights=np.array([0]))
    low_mu = np.zeros(low_policy.weights_size)
    low_sigma = 1e-3 * np.ones(low_policy.weights_size)
    low_distribution = GaussianDiagonalDistribution(low_mu, low_sigma)

    # High-level agent: actions live in the box [0, upper_bound]^2
    upper_bound = 1000
    high_info = MDPInfo(observation_space=mdp.info.observation_space,
                        action_space=spaces.Box(0, upper_bound, (2, )),
                        gamma=mdp.info.gamma,
                        horizon=100)
    high_agent = alg_high(policy=high_policy,
                          mdp_info=high_info,
                          learning_rate=params.get('learning_rate_high'),
                          features=features)

    # Low-level agent: observes a single value in [-pi, pi]
    low_info = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi, (1, )),
                       action_space=mdp.info.action_space,
                       gamma=mdp.info.gamma,
                       horizon=100)
    low_agent = alg_low(distribution=low_distribution,
                        policy=low_policy,
                        mdp_info=low_info,
                        learning_rate=params.get('learning_rate_low'))

    # Control blocks, each with a callback collecting its parameters
    high_params_cb = CollectPolicyParameter(high_policy)
    high_block = ControlBlock(name='Control Block 1',
                              agent=high_agent,
                              n_eps_per_fit=ep_per_run,
                              callbacks=[high_params_cb])

    low_params_cb = CollectDistributionParameter(low_distribution)
    low_block = ControlBlock(name='Control Block 2',
                             agent=low_agent,
                             n_eps_per_fit=10,
                             callbacks=[low_params_cb])

    # Reward accumulator feeding the high-level controller
    reward_acc = reward_accumulator_block(gamma=high_info.gamma,
                                          name='reward_acc')

    blocks = [
        state_in, reward_in, last_action_in, high_block, low_block,
        angle_diff_block, cosine_cost_block, reward_acc
    ]

    # Wiring. The call order is kept exactly as in the reference
    # implementation because the order of add_input calls may matter.
    for placeholder in (state_in, reward_in, last_action_in):
        placeholder.add_input(low_block)
    high_block.add_input(state_in)
    reward_acc.add_input(reward_in)
    reward_acc.add_alarm_connection(low_block)
    high_block.add_reward(reward_acc)
    high_block.add_alarm_connection(low_block)
    angle_diff_block.add_input(high_block)
    angle_diff_block.add_input(state_in)
    cosine_cost_block.add_input(angle_diff_block)
    low_block.add_input(angle_diff_block)
    low_block.add_reward(cosine_cost_block)

    graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(graph)

    # Training with an evaluation before the first round and after each one
    low_level_dataset_eval = []
    dataset_eval = []

    eval_data = core.evaluate(n_episodes=eval_run)
    J = compute_J(eval_data, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval.extend(eval_data)

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        eval_data = core.evaluate(n_episodes=eval_run)
        dataset_eval.extend(eval_data)
        J = compute_J(eval_data, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval.extend(low_block.dataset.get())

    # Persist everything under subdir + str(i)
    high_param_history = high_params_cb.get_values()
    low_param_history = low_params_cb.get_values()

    out_dir = subdir + str(i)
    mk_dir_recursive('./' + out_dir)

    outputs = (
        ('/low_level_dataset_file', low_level_dataset_eval),
        ('/parameter_dataset1_file', high_param_history),
        ('/parameter_dataset2_file', low_param_history),
        ('/dataset_eval_file', dataset_eval),
    )
    for suffix, payload in outputs:
        np.save(out_dir + suffix, payload)
Exemplo n.º 2
0
def build_computational_graph(mdp, agent_low, agent_high, ep_per_fit_low,
                              ep_per_fit_high):
    """Assemble the two-level computational graph around ``mdp``.

    Args:
        mdp: Model passed to ``ComputationalGraph`` as ``model``.
        agent_low: Agent of the low-level control block; its
            ``distribution`` attribute is tracked by a callback.
        agent_high: Agent of the high-level control block; its
            ``distribution`` attribute is tracked by a callback.
        ep_per_fit_low: ``n_eps_per_fit`` of the low-level control block.
        ep_per_fit_high: ``n_eps_per_fit`` of the high-level control block.

    Returns:
        A ``(computational_graph, control_block_h)`` tuple. The high-level
        control block is returned with ``set_mask()`` already called on it.
    """
    # Placeholders exposing state, reward and last action to the graph
    state_in = PlaceHolder(name='state_ph')
    reward_in = PlaceHolder(name='reward_ph')
    last_action_in = PlaceHolder(name='lastaction_ph')

    # Function blocks
    goal_distance_block = fBlock(name='f1 (pick distance to goal state var)',
                                 phi=pick_first_state)
    build_state_block = fBlock(name='f2 (build state)',
                               phi=angle_to_angle_diff_complete_state)
    low_cost_block = fBlock(name='f3 (reward low level)', phi=lqr_cost_segway)
    sum_block = addBlock(name='f4 (add block)')
    fall_block = fBlock(name='f5 (fall punish low level)', phi=fall_reward)

    # Control blocks, each tracking the distribution of its own agent
    high_block = ControlBlock(
        name='Control Block High',
        agent=agent_high,
        n_eps_per_fit=ep_per_fit_high,
        callbacks=[CollectDistributionParameter(agent_high.distribution)])
    low_block = ControlBlock(
        name='Control Block Low',
        agent=agent_low,
        n_eps_per_fit=ep_per_fit_low,
        callbacks=[CollectDistributionParameter(agent_low.distribution)])
    high_block.set_mask()

    blocks = [
        state_in, reward_in, last_action_in, high_block, low_block,
        goal_distance_block, build_state_block, low_cost_block, sum_block,
        fall_block
    ]

    # Wiring. The call order is kept exactly as in the reference
    # implementation because the order of add_input calls may matter.
    for placeholder in (state_in, reward_in, last_action_in):
        placeholder.add_input(low_block)
    high_block.add_input(goal_distance_block)
    high_block.add_reward(reward_in)
    low_block.add_input(build_state_block)
    low_block.add_reward(sum_block)
    goal_distance_block.add_input(state_in)
    for source in (high_block, state_in):
        build_state_block.add_input(source)
    low_cost_block.add_input(build_state_block)
    fall_block.add_input(state_in)
    for source in (low_cost_block, fall_block):
        sum_block.add_input(source)

    return ComputationalGraph(blocks=blocks, model=mdp), high_block
Exemplo n.º 3
0
def segway_experiment(alg_high, alg_low, params_high, params_low):
    """Train and evaluate a two-level controller on ``SegwayLinearMotion``.

    A high-level and a low-level agent are wired into a computational
    graph, trained for ``n_epochs`` epochs and evaluated before training
    and after every epoch. Progress is printed; nothing is returned.

    Args:
        alg_high: Callable building the high-level agent; it is called as
            ``alg_high(distribution, policy, mdp_info, **params_high)``.
        alg_low: Callable building the low-level agent; it is called as
            ``alg_low(distribution, policy, mdp_info, **params_low)``.
        params_high: Keyword arguments forwarded to ``alg_high``.
        params_low: Keyword arguments forwarded to ``alg_low``.

    Note:
        ``n_ep_per_fit``, ``eval_run``, ``n_epochs`` and ``n_iterations``
        are read from the enclosing module scope.
    """
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # High-level approximator: one input, one output
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1,),
                              output_shape=(1,))

    # Policy H: deterministic policy plus a Gaussian over its weights
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2*np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent H: its action is a single value in [-pi/2, pi/2]
    lim = np.pi/2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma,
                              horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L: three inputs, one output
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(3,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0*np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent Low: bounds are taken from the model's observation space with
    # the first component dropped. The original author flagged these
    # bounds with "FIXME FALSE", so they are known to be inexact.
    mdp_info_agent2 = MDPInfo(
        observation_space=spaces.Box(
            low=mdp.info.observation_space.low[1:],  # FIXME FALSE
            high=mdp.info.observation_space.high[1:],  # FIXME FALSE
            shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=mdp.info.horizon)

    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1 (high level) fits half as often as the low level
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High', agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit*2,
                                  callbacks=[parameter_callback1])

    # Control Block 2 (low level)
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])

    # The high-level block starts masked and is unmasked in the loop below
    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)

    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_epochs):
        print('ITERATION', n)

        # Epochs 0 and 1 run with the high-level block masked
        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations*n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        # NOTE(review): the split at index 3 presumably matches
        # n_weights2 for the (3,) -> (1,) approximator; confirm.
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])
Exemplo n.º 4
0
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    """Train a two-level controller on ``SegwayLinearMotion``.

    Two agents are wired into a computational graph together with an
    error accumulator and a reward accumulator, then trained for
    ``n_runs`` rounds with an evaluation (rendered) before training and
    after every round.

    Args:
        alg_high: Not used in the visible part of this function.
        alg_low: Callable used to build BOTH agents; it is called with the
            keywords ``distribution``, ``policy``, ``features``,
            ``mdp_info`` and ``eps``.
        params: Object with a ``get`` method, queried for ``'eps'``.
        subdir: Not used in the visible part of this function.
        i: Not used in the visible part of this function.

    Note:
        ``ep_per_run``, ``eval_run``, ``n_runs`` and ``n_iterations`` are
        not defined here and are read from the enclosing scope.
        NOTE(review): ``subdir``, ``i`` and ``low_level_dataset_eval`` are
        unused in the visible code, so a saving step may follow beyond
        what is shown -- confirm against the full file.
    """

    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)', phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Integrator Block
    error_acc = ErrorAccumulatorBlock(name='error acc')

    # Features
    features1 = Features(basis_list=[PolynomialBasis()])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features1.size, ),
                              output_shape=(1, ))

    # Policy 1
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2e-0 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent 1
    eps1 = params.get('eps')
    lim = 2 * np.pi
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (1, )),
                              gamma=mdp.info.gamma,
                              horizon=20)

    # NOTE(review): agent1 is built with alg_low, and alg_high is never
    # used in the visible code -- confirm this is intended.
    agent1 = alg_low(distribution=dist1,
                     policy=pi1,
                     features=features1,
                     mdp_info=mdp_info_agent1,
                     eps=eps1)

    # Policy 2
    basis = PolynomialBasis.generate(1, 3)
    features2 = Features(basis_list=basis)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(features2.size, ),
                              output_shape=(1, ))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2e-0 * np.ones(n_weights2)
    pi2 = DeterministicPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2 (observes three values, each bounded to [-pi, pi])
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        low=np.array([-np.pi, -np.pi, -np.pi]),
        high=np.array([np.pi, np.pi, np.pi]),
        shape=(3, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=30)

    # Same eps value as agent 1
    agent2 = alg_low(distribution=dist2,
                     policy=pi2,
                     features=features2,
                     mdp_info=mdp_info_agent2,
                     eps=eps1)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=20,
                                  callbacks=[parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, function_block3, function_block4,
        error_acc, reward_acc
    ]

    # Wiring: control block 2 feeds the three placeholders; control block 1
    # takes f1 as input and the accumulated reward as its reward.
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    error_acc.add_input(function_block3)
    error_acc.add_alarm_connection(control_block2)
    # f4 receives both f3 and the error accumulator output
    function_block4.add_input(function_block3)
    function_block4.add_input(error_acc)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    # NOTE(review): low_level_dataset_eval is never filled or read in the
    # visible code.
    low_level_dataset_eval = list()
    dataset_eval = list()

    # Baseline evaluation before any learning (rendering enabled)
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=True)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))