Example #1
    def __init__(self, p, rew, mu=None, gamma=.9, horizon=np.inf):
        """
        Constructor.

        Args:
            p (np.ndarray): transition probability matrix;
            rew (np.ndarray): reward matrix;
            mu (np.ndarray, None): initial state probability distribution;
            gamma (float, .9): discount factor;
            horizon (int, np.inf): the horizon.

        """
        assert p.shape == rew.shape
        assert mu is None or p.shape[0] == mu.size

        # MDP parameters
        self.p = p
        self.r = rew
        self.mu = mu

        # MDP properties
        observation_space = spaces.Discrete(p.shape[0])
        action_space = spaces.Discrete(p.shape[1])
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super().__init__(mdp_info)
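A minimal construction sketch for this constructor, assuming the enclosing class is a FiniteMDP variant (cf. Example #10) and that p is indexed as (state, action, next state); all values below are hypothetical:

import numpy as np

# Hypothetical two-state, two-action chain; p and rew share the shape
# (n_states, n_actions, n_states), so the assertions above hold.
p = np.array([[[.9, .1], [.1, .9]],
              [[.1, .9], [.9, .1]]])
rew = np.zeros_like(p)
rew[:, :, 1] = 1.          # reward for landing in state 1
mu = np.array([1., 0.])    # always start in state 0

mdp = FiniteMDP(p, rew, mu=mu, gamma=.9, horizon=100)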
Example #2
def experiment():
    np.random.seed(3)
    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    action_space = mdp.info.action_space
    observation_space = mdp.info.observation_space
    gamma = mdp.info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    #Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    #Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5),
                              gamma=1,
                              horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space,
                              gamma=gamma,
                              horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10,
                                  agent=agent1,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1,
                                  agent=agent2,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)
    return
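A possible way to run this snippet as a standalone script (assuming the names used above are imported in the same module):

if __name__ == '__main__':
    experiment()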
Example #3
    def __init__(self, height, width, goal, start=(0, 0)):
        # MDP properties
        observation_space = spaces.Discrete(height * width)
        action_space = spaces.Discrete(4)
        horizon = 100
        gamma = .9
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super().__init__(mdp_info, height, width, start, goal)
Example #4
    def __init__(self, height=3, width=3, goal=(0, 2), start=(2, 0)):
        # MDP properties
        observation_space = spaces.Discrete(height * width)
        action_space = spaces.Discrete(4)
        horizon = np.inf
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(GridWorldVanHasselt, self).__init__(mdp_info, height, width,
                                                  start, goal)
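A usage sketch, assuming GridWorldVanHasselt above is importable and its defaults are kept; mdp.info exposes the MDPInfo built in the constructor, as in the other examples:

mdp = GridWorldVanHasselt()
print(mdp.info.observation_space.n)   # 9 states for the 3x3 grid
print(mdp.info.action_space.n)        # 4 actions
print(mdp.info.gamma)                 # 0.95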
Example #5
def build_high_level_agent(alg, params, mdp, epsilon):
    pi = EpsGreedy(epsilon=epsilon)
    mdp_info_high = MDPInfo(observation_space=spaces.Discrete(16),
                            action_space=spaces.Discrete(4),
                            gamma=mdp.info.gamma,
                            horizon=100)

    agent = alg(pi, mdp_info_high, **params)

    return agent
Example #6
def build_high_level_agent(alg, params, mdp):
    epsilon = Parameter(value=0.1)
    pi = EpsGreedy(epsilon=epsilon)
    gamma = 1.0
    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(400),
                              action_space=spaces.Discrete(8),
                              gamma=gamma,
                              horizon=10000)

    agent = alg(policy=pi, mdp_info=mdp_info_agentH, **params)

    return agent
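A hypothetical call; the tabular algorithm (here named QLearning) and its learning_rate parameter are assumptions, chosen only because the function forwards **params to alg(policy=..., mdp_info=..., ...):

# Hypothetical: QLearning and Parameter come from the same RL library as
# the snippets above. Note that this version of the function never reads
# mdp, so any placeholder can be passed for it.
params = dict(learning_rate=Parameter(value=.1))
agent = build_high_level_agent(QLearning, params, mdp=None)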
Example #7
    def __init__(self, grid_map):
        self.__name__ = 'GridWorldGenerator'

        self._grid, height, width, start, goal = self._generate(grid_map)

        # MDP properties
        observation_space = spaces.Discrete(height * width)
        action_space = spaces.Discrete(4)
        horizon = 100
        gamma = .9
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(GridWorldGenerator, self).__init__(mdp_info, height, width,
                                                 start, goal)
Example #8
    def __init__(self, grid_map_file, height_window=84, width_window=84):
        self.__name__ = 'GridWorldPixelGenerator'

        self.window_size = (width_window, height_window)

        self._symbols = {
            '.': 0.,
            'S': 63.75,
            '*': 127.5,
            '#': 191.25,
            'G': 255.
        }

        self._grid, start, goal = self._generate(grid_map_file)
        self._initial_grid = deepcopy(self._grid)
        height = self._grid.shape[0]
        width = self._grid.shape[1]

        assert height_window % height == 0 and width_window % width == 0

        # MDP properties
        observation_space = spaces.Box(low=0.,
                                       high=255.,
                                       shape=(self.window_size[1],
                                              self.window_size[0]))
        action_space = spaces.Discrete(5)
        horizon = 100
        gamma = .9
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(GridWorldPixelGenerator, self).__init__(mdp_info, height, width,
                                                      start, goal)
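The five symbol values above are simply the grey levels obtained by splitting [0, 255] into four equal steps of 63.75; a quick check:

import numpy as np

levels = np.linspace(0., 255., 5)
print(levels)   # 0., 63.75, 127.5, 191.25, 255.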
Example #9
def build_discretized_agent(alg, params, n, optim, loss, mdp, eps, n_features,
                            use_cuda):
    high = mdp.info.observation_space.high
    low = mdp.info.observation_space.low

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n)

    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator,
                pi,
                mdp_info,
                approximator_params=approximator_params,
                **params)

    return agent
Example #10
    def __init__(self, p, rew, mu=None, gamma=.9):
        self.__name__ = 'FiniteMDP'

        assert p.shape == rew.shape
        assert mu is None or p.shape[0] == mu.size

        # MDP parameters
        self.p = p
        self.r = rew
        self.mu = mu

        # MDP properties
        observation_space = spaces.Discrete(p.shape[0])
        action_space = spaces.Discrete(p.shape[1])
        horizon = np.inf
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(FiniteMDP, self).__init__(mdp_info)
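A small sanity check that can be run on the inputs, assuming p is indexed as (state, action, next state) so that every (state, action) pair defines a distribution over next states:

import numpy as np

def check_finite_mdp_inputs(p, rew, mu=None):
    # Mirrors the assertions above and additionally checks that the
    # transition probabilities are properly normalized.
    assert p.shape == rew.shape
    assert np.allclose(p.sum(axis=-1), 1.)
    assert mu is None or (mu.size == p.shape[0] and np.isclose(mu.sum(), 1.))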
Example #11
    def __init__(self,
                 m=2.,
                 M=8.,
                 l=.5,
                 g=9.8,
                 mu=1e-2,
                 max_u=50.,
                 noise_u=10.,
                 horizon=3000,
                 gamma=.95):
        """
        Constructor.

        Args:
            m (float, 2.0): mass of the pendulum;
            M (float, 8.0): mass of the cart;
            l (float, .5): length of the pendulum;
            g (float, 9.8): gravity acceleration constant;
            mu (float, 1e-2): friction constant of the pendulum;
            max_u (float, 50.): maximum allowed input torque;
            noise_u (float, 10.): maximum noise on the action;
            horizon (int, 3000): horizon of the problem;
            gamma (float, .95): discount factor.

        """
        # MDP parameters
        self._m = m
        self._M = M
        self._l = l
        self._g = g
        self._alpha = 1 / (self._m + self._M)
        self._mu = mu
        self._dt = .1
        self._max_u = max_u
        self._noise_u = noise_u
        high = np.array([np.inf, np.inf])

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        # Visualization
        self._viewer = Viewer(2.5 * l, 2.5 * l)
        self._last_u = None
        self._state = None

        super().__init__(mdp_info)
Example #12
    def __init__(self):
        self.__name__ = 'CarOnHill'

        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        horizon = 100
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(CarOnHill, self).__init__(mdp_info)
Example #13
    def __init__(self, horizon=100, gamma=.95):
        """
        Constructor.

        """
        # MDP parameters
        self.max_pos = 1.
        self.max_velocity = 3.
        high = np.array([self.max_pos, self.max_velocity])
        self._g = 9.81
        self._m = 1.
        self._dt = .1
        self._discrete_actions = [-4., 4.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(2)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super().__init__(mdp_info)
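A usage sketch, assuming this is the CarOnHill environment of Example #12 with horizon and gamma exposed as constructor arguments:

mdp = CarOnHill(horizon=200, gamma=.99)
print(mdp.info.horizon, mdp.info.gamma)   # 200 0.99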
Example #14
def build_high_level_agent(alg, params, optim, loss, mdp, horizon_low, eps,
                           n_features, use_cuda):
    high = np.ones(4)
    low = np.zeros(4)

    high[:2] = mdp.info.observation_space.high[:2]
    low[:2] = mdp.info.observation_space.low[:2]

    high[2:] = mdp.info.observation_space.high[3:5]
    low[2:] = mdp.info.observation_space.low[3:5]

    n_actions = 9
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n_actions)

    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma**horizon_low,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator,
                pi,
                mdp_info,
                approximator_params=approximator_params,
                **params)

    return agent
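The gamma**horizon_low above is semi-MDP discounting: the high-level agent sees one transition per low-level sub-episode of horizon_low primitive steps, so one high-level step is discounted by gamma raised to horizon_low. A quick numeric check with hypothetical values:

gamma = 0.99
horizon_low = 10
print(gamma ** horizon_low)   # ~0.904, the effective high-level discount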
Example #15
    def __init__(self):
        self.__name__ = 'InvertedPendulum'

        # MDP parameters
        self.max_degree = np.inf
        self.max_angular_velocity = np.inf
        high = np.array([self.max_degree, self.max_angular_velocity])
        self._g = 9.8
        self._m = 2.
        self._M = 8.
        self._l = .5
        self._alpha = 1. / (self._m + self._M)
        self._dt = .1
        self._discrete_actions = [-50., 0., 50.]

        # MDP properties
        observation_space = spaces.Box(low=-high, high=high)
        action_space = spaces.Discrete(3)
        horizon = 3000
        gamma = .95
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        super(InvertedPendulum, self).__init__(mdp_info)
Example #16
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi,
                                               high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(
        n_tiles_high[0] * n_tiles_high[1]),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)

    agentH = alg_high(policy=piH,
                      mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate,
                      lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings,
                              n_tiles=n_tiles,
                              low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2, )),
                              action_space=mdp.info.action_space,
                              gamma=0.99,
                              horizon=10000)

    # Approximators
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1',
        agent=agent1,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2',
        agent=agent2,
        n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level
    # controller state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, function_block8,
        reward_acc_H, discretization_block
    ]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))
    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration  ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))
        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
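Note that ep_per_run, n_runs and n_iterations are read as module-level globals inside experiment_ghavamzade, so a driver script has to define them before calling it; the values below are hypothetical:

ep_per_run = 20
n_runs = 10
n_iterations = 10
# experiment_ghavamzade(alg_high, alg_low, params, subdir='results/', i=0)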
Example #17
def server_experiment_small(alg_high, alg_low, params, subdir, i):

    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=True, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (direction to angle difference)',
                             phi=direction_to_angle)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    #Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    pi1 = EpsGreedy(epsilon=epsilon)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lambda_coeff = params.get('lambda_coeff')
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Discrete(8),
                              gamma=mdp.info.gamma,
                              horizon=100)
    approximator_params1 = dict(input_shape=(features.size, ),
                                output_shape=mdp_info_agent1.action_space.size,
                                n_actions=mdp_info_agent1.action_space.n)

    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      lambda_coeff=lambda_coeff,
                      features=features,
                      approximator_params=approximator_params1)

    # Control Block 1
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_steps_per_fit=1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    #Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset2 = parameter_callback2.get_values()
    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
Example #18
def experiment():

    small = True

    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    #State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    #Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    #Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    #FeaturesH
    lim = 150 if small else 1000

    tilingsH = Tiles.generate(n_tilings=1,
                              n_tiles=[5, 5],
                              low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)

    mdp_info_agentH = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([lim, lim]), shape=(2, )),
                              action_space=spaces.Discrete(8),
                              gamma=1,
                              horizon=10000)
    approximator_paramsH = dict(input_shape=(featuresH.size, ),
                                output_shape=mdp_info_agentH.action_space.size,
                                n_actions=mdp_info_agentH.action_space.n)

    agentH = TrueOnlineSARSALambda(policy=piH,
                                   mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H',
                                  agent=agentH,
                                  n_steps_per_fit=1)

    #FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size, )

    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator,
                             input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    #Termination Conds
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(name='control block 1',
                                  agent=agent1,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(name='control block 2',
                                  agent=agent2,
                                  n_eps_per_fit=50,
                                  termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the environment state to the low-level
    # controller state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    #Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    #Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
        function_block1, function_block2, function_block3, function_block4,
        function_block5, function_block6, function_block7, reward_acc_H
    ]

    #state_ph.add_input(mux_block)
    #reward_ph.add_input(mux_block)
    #lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25, ))
    act_max_q_val = np.zeros(shape=(25, ))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))
    #low_level_dataset1 = dataset_callback1.get()
    #low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return