def __init__(self, p, rew, mu=None, gamma=.9, horizon=np.inf):
    """
    Constructor.

    Args:
        p (np.ndarray): transition probability matrix;
        rew (np.ndarray): reward matrix;
        mu (np.ndarray, None): initial state probability distribution;
        gamma (float, .9): discount factor;
        horizon (int, np.inf): the horizon.

    """
    assert p.shape == rew.shape
    assert mu is None or p.shape[0] == mu.size

    # MDP parameters
    self.p = p
    self.r = rew
    self.mu = mu

    # MDP properties
    observation_space = spaces.Discrete(p.shape[0])
    action_space = spaces.Discrete(p.shape[1])
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super().__init__(mdp_info)
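# A minimal usage sketch for the constructor above (assumption: this is a
# mushroom-style FiniteMDP whose transition tensor is indexed as
# p[state, action, next_state]):
import numpy as np

# Two states, two actions: action 1 deterministically switches state.
p = np.array([[[1., 0.], [0., 1.]],
              [[0., 1.], [1., 0.]]])
rew = np.zeros((2, 2, 2))
rew[0, 1, 1] = 1.  # reward for moving from state 0 to state 1
mdp = FiniteMDP(p, rew, mu=np.array([1., 0.]), gamma=.9, horizon=100)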
def experiment():
    np.random.seed(3)

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)
    action_space = mdp.info.action_space
    observation_space = mdp.info.observation_space
    gamma = mdp.info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    # Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5), gamma=1,
                              horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space, gamma=gamma,
                              horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10, agent=agent1,
                                  n_eps_per_fit=None, n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1, agent=agent2,
                                  n_eps_per_fit=None, n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)

    return
def __init__(self, height, width, goal, start=(0, 0)):
    # MDP properties
    observation_space = spaces.Discrete(height * width)
    action_space = spaces.Discrete(4)
    horizon = 100
    gamma = .9
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super().__init__(mdp_info, height, width, start, goal)
def __init__(self, height=3, width=3, goal=(0, 2), start=(2, 0)):
    # MDP properties
    observation_space = spaces.Discrete(height * width)
    action_space = spaces.Discrete(4)
    horizon = np.inf
    gamma = .95
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(GridWorldVanHasselt, self).__init__(mdp_info, height, width,
                                              start, goal)
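# A usage sketch for the two grid-world constructors above (assumption:
# states are flattened cell indices and the four actions are the cardinal
# moves, as in mushroom-style grid worlds):
mdp = GridWorldVanHasselt()           # 3x3 grid, start (2, 0), goal (0, 2)
print(mdp.info.observation_space.n)   # 9 discrete states
print(mdp.info.action_space.n)        # 4 actions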
def build_high_level_agent(alg, params, mdp, epsilon):
    pi = EpsGreedy(epsilon=epsilon)
    mdp_info_high = MDPInfo(observation_space=spaces.Discrete(16),
                            action_space=spaces.Discrete(4),
                            gamma=mdp.info.gamma, horizon=100)
    agent = alg(pi, mdp_info_high, **params)

    return agent
def build_high_level_agent(alg, params, mdp):
    epsilon = Parameter(value=0.1)
    pi = EpsGreedy(epsilon=epsilon)
    gamma = 1.0
    mdp_info_agentH = MDPInfo(observation_space=spaces.Discrete(400),
                              action_space=spaces.Discrete(8),
                              gamma=gamma, horizon=10000)
    agent = alg(policy=pi, mdp_info=mdp_info_agentH, **params)

    return agent
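# A call sketch (assumption: `alg` is a mushroom-style tabular TD agent such
# as QLearning, whose remaining constructor arguments arrive via `params`):
params = dict(learning_rate=Parameter(value=.1))
agent_high = build_high_level_agent(QLearning, params, mdp)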
def __init__(self, grid_map):
    self.__name__ = 'GridWorldGenerator'

    self._grid, height, width, start, goal = self._generate(grid_map)

    # MDP properties
    observation_space = spaces.Discrete(height * width)
    action_space = spaces.Discrete(4)
    horizon = 100
    gamma = .9
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(GridWorldGenerator, self).__init__(mdp_info, height, width, start,
                                             goal)
def __init__(self, grid_map_file, height_window=84, width_window=84):
    self.__name__ = 'GridWorldPixelGenerator'

    self.window_size = (width_window, height_window)

    self._symbols = {'.': 0., 'S': 63.75, '*': 127.5, '#': 191.25,
                     'G': 255.}

    self._grid, start, goal = self._generate(grid_map_file)
    self._initial_grid = deepcopy(self._grid)
    height = self._grid.shape[0]
    width = self._grid.shape[1]

    assert height_window % height == 0 and width_window % width == 0

    # MDP properties
    observation_space = spaces.Box(
        low=0., high=255., shape=(self.window_size[1], self.window_size[0]))
    action_space = spaces.Discrete(5)
    horizon = 100
    gamma = .9
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(GridWorldPixelGenerator, self).__init__(mdp_info, height, width,
                                                  start, goal)
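# The assertion above requires the pixel window to tile the grid exactly,
# presumably so every cell maps to an integer-sized patch. A quick check
# with a hypothetical 7x7 map and the default 84x84 window:
height, width = 7, 7
assert 84 % height == 0 and 84 % width == 0
cell_size = (84 // height, 84 // width)  # 12x12 pixels per cell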
def build_discretized_agent(alg, params, n, optim, loss, mdp, eps,
                            n_features, use_cuda):
    high = mdp.info.observation_space.high
    low = mdp.info.observation_space.low

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n)
    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator, pi, mdp_info,
                approximator_params=approximator_params, **params)

    return agent
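# A call sketch (assumptions: `alg` is a DQN-style mushroom agent that takes
# the approximator class as its first argument; the optimizer dict follows
# the usual PyTorch-approximator convention):
import torch
import torch.nn.functional as F

optim = {'class': torch.optim.Adam, 'params': {'lr': 1e-3}}
params = dict(batch_size=32, initial_replay_size=500, max_replay_size=5000,
              target_update_frequency=100)
agent = build_discretized_agent(DQN, params, n=5, optim=optim,
                                loss=F.smooth_l1_loss, mdp=mdp,
                                eps=Parameter(value=1.), n_features=80,
                                use_cuda=False)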
def __init__(self, p, rew, mu=None, gamma=.9):
    self.__name__ = 'FiniteMDP'

    assert p.shape == rew.shape
    assert mu is None or p.shape[0] == mu.size

    # MDP parameters
    self.p = p
    self.r = rew
    self.mu = mu

    # MDP properties
    observation_space = spaces.Discrete(p.shape[0])
    action_space = spaces.Discrete(p.shape[1])
    horizon = np.inf
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(FiniteMDP, self).__init__(mdp_info)
def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10.,
             horizon=3000, gamma=.95):
    """
    Constructor.

    Args:
        m (float, 2.0): mass of the pendulum;
        M (float, 8.0): mass of the cart;
        l (float, .5): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 50.): maximum allowed input torque;
        noise_u (float, 10.): maximum noise on the action;
        horizon (int, 3000): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self._m = m
    self._M = M
    self._l = l
    self._g = g
    self._alpha = 1 / (self._m + self._M)
    self._mu = mu
    self._dt = .1
    self._max_u = max_u
    self._noise_u = noise_u
    high = np.array([np.inf, np.inf])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(3)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(2.5 * l, 2.5 * l)
    self._last_u = None
    self._state = None

    super().__init__(mdp_info)
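# An action-interface sketch (assumption, consistent with the older variant
# later in this section where _discrete_actions = [-50., 0., 50.]): the
# three discrete actions presumably map to torques {-max_u, 0., max_u},
# perturbed by uniform noise bounded by noise_u. `_apply_action` is a
# hypothetical helper, not part of the class above:
import numpy as np

def _apply_action(action, max_u=50., noise_u=10.):
    u = [-max_u, 0., max_u][int(action)]
    return u + np.random.uniform(-noise_u, noise_u)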
def __init__(self):
    self.__name__ = 'CarOnHill'

    # MDP parameters
    self.max_pos = 1.
    self.max_velocity = 3.
    high = np.array([self.max_pos, self.max_velocity])
    self._g = 9.81
    self._m = 1
    self._dt = .1
    self._discrete_actions = [-4., 4.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(2)
    horizon = 100
    gamma = .95
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(CarOnHill, self).__init__(mdp_info)
def __init__(self, horizon=100, gamma=.95):
    """
    Constructor.

    Args:
        horizon (int, 100): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self.max_pos = 1.
    self.max_velocity = 3.
    high = np.array([self.max_pos, self.max_velocity])
    self._g = 9.81
    self._m = 1.
    self._dt = .1
    self._discrete_actions = [-4., 4.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(2)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super().__init__(mdp_info)
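# A usage sketch (assumption: this is the classic car-on-hill task of Ernst
# et al., where the two discrete actions select thrust -4. or +4.):
mdp = CarOnHill(horizon=100, gamma=.95)
print(mdp.info.observation_space.low)   # [-1. -3.]
print(mdp.info.observation_space.high)  # [ 1.  3.]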
def build_high_level_agent(alg, params, optim, loss, mdp, horizon_low, eps,
                           n_features, use_cuda):
    high = np.ones(4)
    low = np.zeros(4)
    high[:2] = mdp.info.observation_space.high[:2]
    low[:2] = mdp.info.observation_space.low[:2]
    high[2:] = mdp.info.observation_space.high[3:5]
    low[2:] = mdp.info.observation_space.low[3:5]

    n_actions = 9

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n_actions)
    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma**horizon_low,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator, pi, mdp_info,
                approximator_params=approximator_params, **params)

    return agent
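# Note on the discount: the high-level agent acts on a semi-MDP in which
# each of its decisions spans horizon_low primitive steps, so the
# per-decision discount is gamma**horizon_low. A worked example:
gamma, horizon_low = .99, 10
print(gamma ** horizon_low)  # ~0.904: the effective high-level discount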
def __init__(self):
    self.__name__ = 'InvertedPendulum'

    # MDP parameters
    self.max_degree = np.inf
    self.max_angular_velocity = np.inf
    high = np.array([self.max_degree, self.max_angular_velocity])
    self._g = 9.8
    self._m = 2.
    self._M = 8.
    self._l = .5
    self._alpha = 1. / (self._m + self._M)
    self._dt = .1
    self._discrete_actions = [-50., 0., 50.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(3)
    horizon = 3000
    gamma = .95
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(InvertedPendulum, self).__init__(mdp_info)
def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')
    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Discrete(n_tiles_high[0] * n_tiles_high[1]),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)
    agentH = alg_high(policy=piH, mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate, lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                              high=high)
    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([150, 150]), shape=(2,)),
        action_space=mdp.info.action_space, gamma=0.99, horizon=10000)

    # Approximators
    input_shape = (featuresL.size,)
    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    # Termination Conditions
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env state to the low-level controller state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f8 selector')

    # Mux Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, function_block8, reward_acc_H,
              discretization_block]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))

        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0] ** 2)
    act_max_q_val = np.zeros(n_tiles_high[0] ** 2)
    for n in range(n_tiles_high[0] ** 2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
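# A call sketch (assumptions: `alg_high` is a tabular TD(lambda) agent such
# as SARSALambda, matching the learning_rate/lambda_coeff keywords used
# above; `alg_low` is a policy-gradient method such as GPOMDP; ep_per_run,
# n_runs and n_iterations are module-level globals):
params = dict(learning_rate_high=Parameter(value=.1),
              learning_rate_low=AdaptiveParameter(value=1e-5),
              low_ep_per_fit=10)
experiment_ghavamzade(SARSALambda, GPOMDP, params, 'ghavamzade/', 0)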
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=True, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (direction to angle difference)',
                             phi=direction_to_angle)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    pi1 = EpsGreedy(epsilon=epsilon)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lambda_coeff = params.get('lambda_coeff')
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Discrete(8),
                              gamma=mdp.info.gamma, horizon=100)
    approximator_params1 = dict(
        input_shape=(features.size,),
        output_shape=mdp_info_agent1.action_space.size,
        n_actions=mdp_info_agent1.action_space.n)
    agent1 = alg_high(policy=pi1, mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      lambda_coeff=lambda_coeff, features=features,
                      approximator_params=approximator_params1)

    # Control Block 1
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_steps_per_fit=1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = alg_low(distribution=distribution2, policy=pi2,
                     mdp_info=mdp_info_agent2, learning_rate=learning_rate2)

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset2 = parameter_callback2.get_values()
    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return
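# A call sketch (assumptions: `alg_high` is a TD(lambda) agent with a linear
# approximator such as TrueOnlineSARSALambda, and `alg_low` is a black-box
# distributional optimizer such as PGPE, matching the keyword arguments
# used above):
params = dict(learning_rate_high=Parameter(value=.1), lambda_coeff=.9,
              learning_rate_low=AdaptiveParameter(value=1e-2))
server_experiment_small(TrueOnlineSARSALambda, PGPE, params, 'small/', 0)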
def experiment():
    small = True
    print('ENV IS SMALL? ', small)
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=small, hard=True, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    lim = 150 if small else 1000
    tilingsH = Tiles.generate(n_tilings=1, n_tiles=[5, 5], low=[0, 0],
                              high=[lim, lim])
    featuresH = Features(tilings=tilingsH)

    # PolicyH
    epsilon = LinearDecayParameter(value=0.1, min_value=0.0, n=10000)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = Parameter(value=1)
    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([lim, lim]), shape=(2,)),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)
    approximator_paramsH = dict(
        input_shape=(featuresH.size,),
        output_shape=mdp_info_agentH.action_space.size,
        n_actions=mdp_info_agentH.action_space.n)
    agentH = TrueOnlineSARSALambda(policy=piH, mdp_info=mdp_info_agentH,
                                   learning_rate=learning_rate,
                                   lambda_coeff=0.9,
                                   approximator_params=approximator_paramsH,
                                   features=featuresH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    featuresL = Features(basis_list=[PolynomialBasis()])

    # Policy1
    input_shape = (featuresL.size,)
    approximator_params = dict(input_dim=input_shape[0])
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             **approximator_params)
    sigma = np.array([[1.3e-2]])
    pi1 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Policy2
    pi2 = GaussianPolicy(mu=approximator, sigma=sigma)

    # Agent1
    learning_rate1 = AdaptiveParameter(value=1e-5)
    agent1 = GPOMDP(pi1, mdp.info, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = AdaptiveParameter(value=1e-5)
    agent2 = GPOMDP(pi2, mdp.info, learning_rate2, featuresL)

    # Termination Conditions
    termination_condition1 = TerminationCondition(active_dir=1, small=small)
    termination_condition2 = TerminationCondition(active_dir=5, small=small)

    # Control Block +
    control_block1 = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=50,
        termination_condition=termination_condition1)

    # Control Block x
    control_block2 = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=50,
        termination_condition=termination_condition2)

    # Function Block 1: picks the state for the high-level controller
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env state to the low-level controller state
    function_block2 = fBlock(phi=rototranslate(small=small), name='f2 rotot')

    # Function Block 3: holds the current state as reference
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds the high-level reward
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds the low-level reward
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: external reward of the high-level controller
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: external reward of the low-level controller
    function_block7 = fBlock(phi=G_low(small=small), name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Mux Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block1])
    mux_block.add_block_list([control_block2])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, reward_acc_H]

    # state_ph.add_input(mux_block)
    # reward_ph.add_input(mux_block)
    # lastaction_ph.add_input(mux_block)
    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block1)
    reward_acc_H.add_alarm_connection(control_block2)
    control_blockH.add_input(function_block1)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block1)
    control_blockH.add_alarm_connection(control_block2)
    mux_block.add_input(control_blockH)
    mux_block.add_input(function_block2)
    control_block1.add_reward(function_block5)
    control_block2.add_reward(function_block5)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)
    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block1)
    function_block3.add_alarm_connection(control_block2)
    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)
    function_block5.add_input(reward_ph)
    function_block5.add_input(function_block7)
    function_block6.add_input(reward_ph)
    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_visual = list()
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()

    n_runs = 5
    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=1000, skip=True)
        dataset_eval = core.evaluate(n_episodes=10)
        last_ep_dataset = pick_last_ep(dataset_eval)
        dataset_eval_visual += last_ep_dataset
        low_level_dataset_eval1 += control_block1.dataset.get()
        low_level_dataset_eval2 += control_block2.dataset.get()

    # Visualize
    hi_lev_params = agentH.Q.get_weights()
    hi_lev_params = np.reshape(hi_lev_params, (8, 25))
    max_q_val = np.zeros(shape=(25,))
    act_max_q_val = np.zeros(shape=(25,))
    for i in range(25):
        max_q_val[i] = np.amax(hi_lev_params[:, i])
        act_max_q_val[i] = np.argmax(hi_lev_params[:, i])
    max_q_val_tiled = np.reshape(max_q_val, (5, 5))
    act_max_q_val_tiled = np.reshape(act_max_q_val, (5, 5))

    # low_level_dataset1 = dataset_callback1.get()
    # low_level_dataset2 = dataset_callback2.get()

    subdir = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/'
    mk_dir_recursive('./' + subdir)

    np.save(subdir + '/low_level_dataset1_file', low_level_dataset_eval1)
    np.save(subdir + '/low_level_dataset2_file', low_level_dataset_eval2)
    np.save(subdir + '/max_q_val_tiled_file', max_q_val_tiled)
    np.save(subdir + '/act_max_q_val_tiled_file', act_max_q_val_tiled)
    np.save(subdir + '/dataset_eval_file', dataset_eval_visual)

    return
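# A post-processing sketch (assumption: `subdir` points at a run directory
# written by the experiment above):
import numpy as np

max_q = np.load(subdir + 'max_q_val_tiled_file.npy')
acts = np.load(subdir + 'act_max_q_val_tiled_file.npy')
# the evaluation dataset is an object array, hence allow_pickle:
dataset = np.load(subdir + 'dataset_eval_file.npy', allow_pickle=True)
print(max_q)  # 5x5 map of the highest Q-value per tile
print(acts)   # greedy high-level action index (0-7) per tile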