def __init__(self, A, B, Q, R, random_init=False, gamma=0.9, horizon=50):
    """
    Constructor.

    Args:
        A (np.ndarray): the state dynamics matrix;
        B (np.ndarray): the action dynamics matrix;
        Q (np.ndarray): reward weight matrix for state;
        R (np.ndarray): reward weight matrix for action;
        random_init (bool, False): start from a random state;
        gamma (float, 0.9): discount factor;
        horizon (int, 50): horizon of the mdp.

    """
    self.A = A
    self.B = B
    self.Q = Q
    self.R = R
    self.random_init = random_init

    # MDP properties
    high_x = np.inf * np.ones(A.shape[0])
    low_x = -high_x

    high_u = np.inf * np.ones(B.shape[0])
    low_u = -high_u

    observation_space = spaces.Box(low=low_x, high=high_x)
    action_space = spaces.Box(low=low_u, high=high_u)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(LQR, self).__init__(mdp_info)

def __init__(self, small=True):
    self.__name__ = 'ShipSteering'

    # MDP parameters
    self.field_size = 150 if small else 1000
    low = np.array([0, 0, -np.pi, -np.pi / 12.])
    high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
    self.omega_max = np.array([np.pi / 12.])
    self._v = 3.
    self._T = 5.
    self._dt = .2
    self._gate_s = np.empty(2)
    self._gate_e = np.empty(2)
    self._gate_s[0] = 100 if small else 900
    self._gate_s[1] = 120 if small else 920
    self._gate_e[0] = 120 if small else 920
    self._gate_e[1] = 100 if small else 900

    # MDP properties
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
    horizon = 5000
    gamma = .99
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(ShipSteering, self).__init__(mdp_info)

def __init__(self, small=True, hard=False):
    """
    Constructor.

    Args:
        small (bool, True): whether to use a small state space or not;
        hard (bool, False): whether to use -100 as reward for going outside
            or -10000. With -100 reward the environment is considerably
            harder.

    """
    self.__name__ = 'ShipSteeringStraight'

    # MDP parameters
    self.field_size = 150
    low = np.array([0, 0, -np.pi, -np.pi / 12.])
    high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
    self.omega_max = np.array([np.pi / 12.])
    self._v = 3.
    self._T = 5.
    self._dt = .2
    self.goal_pos = np.array([140, 75])
    self._out_reward = -100
    self._success_reward = 100

    # MDP properties
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
    horizon = 5000
    gamma = .99
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(ShipSteeringStraight, self).__init__(mdp_info)

def __init__(self, n_steps_action=3, viz_speed=100, small=False):
    self.__name__ = 'ShipSteeringMultiGate'
    self.n_steps_action = n_steps_action
    self.viz_speed = viz_speed

    # MDP parameters
    self.no_of_gates = 4
    self.small = small
    self.field_size = 500 if small else 1000
    low = np.array([0, 0, -np.pi, -np.pi / 12., 0])
    high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.,
                     self.no_of_gates])
    self.omega_max = np.array([np.pi / 12.])
    self._v = 3.
    self._T = 5.
    self._dt = .2

    gate_1s = np.array([75, 175]) if small else np.array([150, 350])
    gate_1e = np.array([125, 175]) if small else np.array([250, 350])
    gate_1 = np.array([gate_1s, gate_1e])

    gate_2s = np.array([150, 300]) if small else np.array([300, 600])
    gate_2e = np.array([200, 300]) if small else np.array([400, 600])
    gate_2 = np.array([gate_2s, gate_2e])

    gate_3s = np.array([250, 350]) if small else np.array([500, 700])
    gate_3e = np.array([300, 350]) if small else np.array([600, 700])
    gate_3 = np.array([gate_3s, gate_3e])

    gate_4s = np.array([150, 425]) if small else np.array([300, 850])
    gate_4e = np.array([200, 425]) if small else np.array([400, 850])
    gate_4 = np.array([gate_4s, gate_4e])

    self._gate_list = gate_1, gate_2, gate_3, gate_4

    # MDP properties
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
    horizon = 5000
    gamma = .99
    self._out_reward = -10000
    self.correct_order = False
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(self.field_size, self.field_size,
                          background=(66, 131, 237))

    super(ShipSteeringMultiGate, self).__init__(mdp_info)

def __init__(self, **kwargs):
    # Define environment properties
    high_x = np.array([5.0, 5.0, np.pi])
    low_x = -high_x
    high_u = np.array([1.0, 3.0])
    low_u = -high_u

    observation_space = spaces.Box(low=low_x, high=high_x)
    action_space = spaces.Box(low=low_u, high=high_u)
    gamma = 0.9
    horizon = 400
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    hz = 10.0
    super(TurtlebotGazebo, self).__init__('turtlebot_gazebo', mdp_info, hz,
                                          **kwargs)

    # publish to the /cmd_vel topic to send the velocity setpoint
    self._pub = rospy.Publisher('/cmd_vel', Twist, queue_size=1)

    # use the /gazebo/get_model_state service to get the position of the turtlebot
    model_state_service_name = '/gazebo/get_model_state'
    rospy.wait_for_service(model_state_service_name)
    self._model_state_service = rospy.ServiceProxy(model_state_service_name,
                                                   GetModelState)

def __init__(self, A, B, Q, R, max_pos=np.inf, max_action=np.inf,
             random_init=False, episodic=False, gamma=0.9, horizon=50):
    """
    Constructor.

    Args:
        A (np.ndarray): the state dynamics matrix;
        B (np.ndarray): the action dynamics matrix;
        Q (np.ndarray): reward weight matrix for state;
        R (np.ndarray): reward weight matrix for action;
        max_pos (float, np.inf): maximum value of the state;
        max_action (float, np.inf): maximum value of the action;
        random_init (bool, False): start from a random state;
        episodic (bool, False): end the episode when the state goes over
            the threshold;
        gamma (float, 0.9): discount factor;
        horizon (int, 50): horizon of the mdp.

    """
    self.A = A
    self.B = B
    self.Q = Q
    self.R = R
    self._max_pos = max_pos
    self._max_action = max_action
    self._episodic = episodic
    self.random_init = random_init

    # MDP properties
    high_x = self._max_pos * np.ones(A.shape[0])
    low_x = -high_x

    high_u = self._max_action * np.ones(B.shape[0])
    low_u = -high_u

    observation_space = spaces.Box(low=low_x, high=high_x)
    action_space = spaces.Box(low=low_u, high=high_u)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super().__init__(mdp_info)

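A minimal usage sketch for the bounded LQR constructor above. It assumes the enclosing class is named LQR and is importable together with numpy; the 2x2 matrices and the bounds below are placeholders chosen only for illustration.

import numpy as np

# Hypothetical 2-dimensional instance; A, B, Q, R are illustrative placeholders.
A = np.eye(2)          # state dynamics
B = np.eye(2)          # action dynamics
Q = np.eye(2)          # state reward weights
R = 0.1 * np.eye(2)    # action reward weights

mdp = LQR(A, B, Q, R, max_pos=10., max_action=5., gamma=0.9, horizon=50)

# The MDP info exposes the bounded spaces built in the constructor.
print(mdp.info.observation_space.high)  # [10. 10.]
print(mdp.info.action_space.high)       # [5. 5.]
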
def __init__(self, grid_map_file, height_window=84, width_window=84):
    self.__name__ = 'GridWorldPixelGenerator'

    self.window_size = (width_window, height_window)

    self._symbols = {
        '.': 0.,
        'S': 63.75,
        '*': 127.5,
        '#': 191.25,
        'G': 255.
    }

    self._grid, start, goal = self._generate(grid_map_file)
    self._initial_grid = deepcopy(self._grid)
    height = self._grid.shape[0]
    width = self._grid.shape[1]

    assert height_window % height == 0 and width_window % width == 0

    # MDP properties
    observation_space = spaces.Box(low=0., high=255.,
                                   shape=(self.window_size[1],
                                          self.window_size[0]))
    action_space = spaces.Discrete(5)
    horizon = 100
    gamma = .9
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(GridWorldPixelGenerator, self).__init__(mdp_info, height, width,
                                                  start, goal)

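A quick check of the assertion above: the grid is rendered into a fixed-size pixel window, so the grid's height and width must divide the window dimensions exactly. The grid shape used here is a hypothetical example, not taken from any map file.

# Illustrative divisibility check for a hypothetical 12x14 grid map with the
# default 84x84 window: each cell maps to an integer block of pixels.
height_window, width_window = 84, 84
height, width = 12, 14

assert height_window % height == 0 and width_window % width == 0
pixels_per_cell = (height_window // height, width_window // width)
print(pixels_per_cell)  # (7, 6)
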
def build_low_level_ghavamzadeh(alg, params, mdp):
    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2,)),
        action_space=mdp.info.action_space, gamma=0.99, horizon=10000)

    input_shape = (featuresL.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    std = np.array([3e-2])
    pi = DiagonalGaussianPolicy(mu=approximator, std=std)

    agent = alg(pi, mdp_info_agentL, features=featuresL, **params)

    return agent

def build_discretized_agent(alg, params, n, optim, loss, mdp, eps, n_features,
                            use_cuda):
    high = mdp.info.observation_space.high
    low = mdp.info.observation_space.low

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n)
    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator, pi, mdp_info,
                approximator_params=approximator_params, **params)

    return agent

def __init__(self, random_start=False, m=1., l=1., g=9.8, mu=1e-2, max_u=5.,
             horizon=5000, gamma=.99):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one;
        m (float, 1.0): mass of the pendulum;
        l (float, 1.0): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 5.0): maximum allowed input torque;
        horizon (int, 5000): horizon of the problem;
        gamma (float, .99): discount factor.

    """
    # MDP parameters
    self._m = m
    self._l = l
    self._g = g
    self._mu = mu
    self._random = random_start
    self._dt = .01
    self._max_u = max_u
    self._max_omega = 5 / 2 * np.pi
    high = np.array([np.pi, self._max_omega])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-max_u]),
                              high=np.array([max_u]))
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(2.5 * l, 2.5 * l)
    self._last_u = None

    super().__init__(mdp_info)

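A hedged rollout sketch for the pendulum constructor above. It assumes the enclosing class (called Pendulum here purely for illustration) implements the usual reset/step interface of the Environment base class, with step returning the next state, the reward, an absorbing flag and an info dictionary.

import numpy as np

mdp = Pendulum(random_start=True)  # hypothetical name of the enclosing class

state = mdp.reset()
for _ in range(10):
    # Sample a random torque within the action bounds defined in the constructor.
    u = np.random.uniform(mdp.info.action_space.low,
                          mdp.info.action_space.high)
    state, reward, absorbing, _ = mdp.step(u)
    if absorbing:
        break
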
def build_high_level_agent(alg, params, mdp, mu, std):
    tilings = Tiles.generate(n_tilings=1, n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size,)

    mu_approximator = Regressor(LinearApproximator, input_shape=input_shape,
                                output_shape=(1,))
    std_approximator = Regressor(LinearApproximator, input_shape=input_shape,
                                 output_shape=(1,))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array([mdp.info.observation_space.low[0],
                        mdp.info.observation_space.low[1]])
    obs_high = np.array([mdp.info.observation_space.high[0],
                         mdp.info.observation_space.high[1]])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(obs_low, obs_high,
                                                           shape=(2,)),
                              action_space=spaces.Box(
                                  mdp.info.observation_space.low[2],
                                  mdp.info.observation_space.high[2],
                                  shape=(1,)),
                              gamma=1, horizon=10)
    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent

def __init__(self, random_start=False, goal_distance=1.0):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one;
        goal_distance (float, 1.0): distance of the goal position from the
            starting position.

    """
    # MDP parameters
    gamma = 0.99

    self.Mr = 0.3 * 2
    self.Mp = 2.55
    self.Ip = 2.6e-2
    self.Ir = 4.54e-4 * 2
    self.l = 13.8e-2
    self.r = 5.5e-2
    self.dt = 1e-2
    self.g = 9.81
    self.max_u = 5

    self._random = random_start
    self._goal_distance = goal_distance

    high = np.array([2 * self._goal_distance, np.pi, 15, 75])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-self.max_u]),
                              high=np.array([self.max_u]))
    horizon = 1500
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    env_width = 4 * goal_distance
    env_height = 2.5 * 2 * self.l
    width = 800
    height = int(width * env_height / env_width)

    self._viewer = Viewer(env_width, env_height, width, height)

    super(SegwayLinearMotion, self).__init__(mdp_info)

def __init__(self, small=True, n_steps_action=3):
    """
    Constructor.

    Args:
        small (bool, True): whether to use a small state space or not;
        n_steps_action (int, 3): number of integration intervals for each
            step of the mdp.

    """
    # MDP parameters
    self.field_size = 150 if small else 1000
    low = np.array([0, 0, -np.pi, -np.pi / 12.])
    high = np.array([self.field_size, self.field_size, np.pi, np.pi / 12.])
    self.omega_max = np.array([np.pi / 12.])
    self._v = 3.
    self._T = 5.
    self._dt = .2
    self._gate_s = np.empty(2)
    self._gate_e = np.empty(2)
    self._gate_s[0] = 100 if small else 350
    self._gate_s[1] = 120 if small else 400
    self._gate_e[0] = 120 if small else 450
    self._gate_e[1] = 100 if small else 400
    self._out_reward = -100
    self._success_reward = 0
    self._small = small
    self._state = None
    self.n_steps_action = n_steps_action

    # MDP properties
    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Box(low=-self.omega_max, high=self.omega_max)
    horizon = 5000
    gamma = .99
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(self.field_size, self.field_size,
                          background=(66, 131, 237))

    super(ShipSteering, self).__init__(mdp_info)

def build_agent_high(alg, params, std, mdp):
    # Features
    approximator1 = Regressor(LinearApproximator, input_shape=(1,),
                              output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu = np.zeros(n_weights)
    sigma = std * np.ones(n_weights)
    pi = DeterministicPolicy(approximator1)
    dist = GaussianDiagonalDistribution(mu, sigma)

    lim = np.pi / 2
    low = mdp.info.observation_space.low[0:1]
    high = mdp.info.observation_space.high[0:1]
    mdp_info = MDPInfo(observation_space=spaces.Box(low, high),
                       action_space=spaces.Box(-lim, lim, (1,)),
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    return alg(dist, pi, mdp_info, **params)

def __init__(self, random_start=False, m=1.0, l=1.0, g=9.8, mu=1e-2,
             max_u=2.0):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one;
        m (float, 1.0): mass of the pendulum;
        l (float, 1.0): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 2.0): maximum allowed input torque.

    """
    # MDP parameters
    self._g = g
    self._m = m
    self._l = l
    self._mu = mu
    self._random = random_start
    self._dt = 0.02
    self._max_u = max_u
    self._max_omega = 78.54
    high = np.array([np.pi, self._max_omega])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-max_u]),
                              high=np.array([max_u]))
    horizon = 5000
    gamma = .99
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(InvertedPendulum, self).__init__(mdp_info)

def __init__(self, random_start=False):
    """
    Constructor.

    Args:
        random_start (bool, False): whether to start from a random position
            or from the horizontal one.

    """
    # MDP parameters
    gamma = 0.97

    self._Mr = 0.3 * 2
    self._Mp = 2.55
    self._Ip = 2.6e-2
    self._Ir = 4.54e-4 * 2
    self._l = 13.8e-2
    self._r = 5.5e-2
    self._dt = 1e-2
    self._g = 9.81
    self._max_u = 5

    self._random = random_start

    high = np.array([-np.pi / 2, 15, 75])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Box(low=np.array([-self._max_u]),
                              high=np.array([self._max_u]))
    horizon = 300
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(5 * self._l, 5 * self._l)
    self._last_x = 0

    super(Segway, self).__init__(mdp_info)

def build_mid_level_agent(alg, params, mdp, mu, std):
    mu_approximator = Regressor(LinearApproximator, input_shape=(1,),
                                output_shape=(2,))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    pi = DiagonalGaussianPolicy(mu=mu_approximator, std=std * np.ones(2))

    lim = mdp.info.observation_space.high[0]
    basis = PolynomialBasis()
    features = BasisFeatures(basis=[basis])
    mdp_info_agent1 = MDPInfo(observation_space=spaces.Box(0, 1, (1,)),
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=1, horizon=10)
    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent

def __init__(self, m=2., M=8., l=.5, g=9.8, mu=1e-2, max_u=50., noise_u=10.,
             horizon=3000, gamma=.95):
    """
    Constructor.

    Args:
        m (float, 2.0): mass of the pendulum;
        M (float, 8.0): mass of the cart;
        l (float, .5): length of the pendulum;
        g (float, 9.8): gravity acceleration constant;
        mu (float, 1e-2): friction constant of the pendulum;
        max_u (float, 50.): maximum allowed input torque;
        noise_u (float, 10.): maximum noise on the action;
        horizon (int, 3000): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self._m = m
    self._M = M
    self._l = l
    self._g = g
    self._alpha = 1 / (self._m + self._M)
    self._mu = mu
    self._dt = .1
    self._max_u = max_u
    self._noise_u = noise_u
    high = np.array([np.inf, np.inf])

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(3)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    # Visualization
    self._viewer = Viewer(2.5 * l, 2.5 * l)
    self._last_u = None
    self._state = None

    super().__init__(mdp_info)

def build_high_level_agent(alg, params, mdp, mu, sigma):
    features = Features(basis_list=[PolynomialBasis()])
    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size,),
                             output_shape=(2,))
    approximator.set_weights(mu)

    pi1 = DiagonalGaussianPolicy(mu=approximator, std=sigma)

    lim = mdp.info.observation_space.high[0]
    mdp_info_agent = MDPInfo(observation_space=mdp.info.observation_space,
                             action_space=spaces.Box(0, lim, (2,)),
                             gamma=1.0,
                             horizon=100)
    agent = alg(pi1, mdp_info_agent, features=features, **params)

    return agent

def build_low_level_agent(alg, params, mdp):
    features = Features(basis_list=[PolynomialBasis(dimensions=[0],
                                                    degrees=[1])])

    pi = DeterministicControlPolicy(weights=np.array([0]))
    mu = np.zeros(pi.weights_size)
    sigma = 1e-3 * np.ones(pi.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent = alg(distribution, pi, mdp_info_agent2, features=features,
                **params)

    return agent

def __init__(self, horizon=100, gamma=.95):
    """
    Constructor.

    Args:
        horizon (int, 100): horizon of the problem;
        gamma (float, .95): discount factor.

    """
    # MDP parameters
    self.max_pos = 1.
    self.max_velocity = 3.
    high = np.array([self.max_pos, self.max_velocity])
    self._g = 9.81
    self._m = 1.
    self._dt = .1
    self._discrete_actions = [-4., 4.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(2)
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super().__init__(mdp_info)

def __init__(self):
    self.__name__ = 'CarOnHill'

    # MDP parameters
    self.max_pos = 1.
    self.max_velocity = 3.
    high = np.array([self.max_pos, self.max_velocity])
    self._g = 9.81
    self._m = 1
    self._dt = .1
    self._discrete_actions = [-4., 4.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(2)
    horizon = 100
    gamma = .95
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(CarOnHill, self).__init__(mdp_info)

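A minimal instantiation sketch for the CarOnHill constructor above. The comment on the action mapping is an assumption: the two discrete actions are presumably translated to the forces listed in _discrete_actions by the step function, which is not shown here.

mdp = CarOnHill()

print(mdp.info.action_space.n)           # 2 discrete actions (force -4. or +4.)
print(mdp.info.observation_space.high)   # [1. 3.] (max position, max velocity)
print(mdp.info.gamma, mdp.info.horizon)  # 0.95 100
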
def build_agent_low(alg, params, std, mdp):
    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             output_shape=(1,))

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = std * np.ones(n_weights)
    pi = DeterministicControlPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    # Agent Low
    mdp_info = MDPInfo(
        observation_space=spaces.Box(
            low=mdp.info.observation_space.low[1:],    # FIXME FALSE
            high=mdp.info.observation_space.high[1:],  # FIXME FALSE
        ),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=mdp.info.horizon)

    return alg(dist, pi, mdp_info, **params)

def build_low_level_agent(alg, params, mdp, horizon, std):
    rho_max = np.linalg.norm(mdp.info.observation_space.high[:2] -
                             mdp.info.observation_space.low[:2])
    low = np.array([-np.pi, 0])
    high = np.array([np.pi, rho_max])

    basis = FourierBasis.generate(low, high, 10)
    features = Features(basis_list=basis)

    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size,),
                             output_shape=mdp.info.action_space.shape)

    pi = DiagonalGaussianPolicy(approximator, std)

    mdp_info_agent = MDPInfo(observation_space=spaces.Box(low, high),
                             action_space=mdp.info.action_space,
                             gamma=mdp.info.gamma, horizon=horizon)
    agent = alg(pi, mdp_info_agent, features=features, **params)

    return agent

def build_high_level_agent(alg, params, optim, loss, mdp, horizon_low, eps,
                           n_features, use_cuda):
    high = np.ones(4)
    low = np.zeros(4)

    high[:2] = mdp.info.observation_space.high[:2]
    low[:2] = mdp.info.observation_space.low[:2]

    high[2:] = mdp.info.observation_space.high[3:5]
    low[2:] = mdp.info.observation_space.low[3:5]

    n_actions = 9

    observation_space = spaces.Box(low=low, high=high)
    action_space = spaces.Discrete(n_actions)
    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=action_space,
                       gamma=mdp.info.gamma**horizon_low,
                       horizon=mdp.info.horizon)

    pi = Boltzmann(eps)

    approximator_params = dict(network=Network,
                               optimizer=optim,
                               loss=loss,
                               n_features=n_features,
                               input_shape=mdp_info.observation_space.shape,
                               output_shape=mdp_info.action_space.size,
                               n_actions=mdp_info.action_space.n,
                               use_cuda=use_cuda)

    agent = alg(PyTorchApproximator, pi, mdp_info,
                approximator_params=approximator_params, **params)

    return agent

def __init__(self):
    self.__name__ = 'InvertedPendulum'

    # MDP parameters
    self.max_degree = np.inf
    self.max_angular_velocity = np.inf
    high = np.array([self.max_degree, self.max_angular_velocity])
    self._g = 9.8
    self._m = 2.
    self._M = 8.
    self._l = .5
    self._alpha = 1. / (self._m + self._M)
    self._dt = .1
    self._discrete_actions = [-50., 0., 50.]

    # MDP properties
    observation_space = spaces.Box(low=-high, high=high)
    action_space = spaces.Discrete(3)
    horizon = 3000
    gamma = .95
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    super(InvertedPendulum, self).__init__(mdp_info)

def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([75, 75]))
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator, input_shape=(1,),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2,)),
                              gamma=mdp.info.gamma, horizon=50)
    agent1 = GPOMDP(policy=pi1, mdp_info=mdp_info_agent1,
                    params=agent_params, features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = GPOMDP(policy=pi2, mdp_info=mdp_info_agent2,
                    params=agent_params, features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[dataset_callback,
                                             parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, control_block1, control_block2,
              function_block1, function_block2, function_block3, reward_acc]
    # order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    # dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)
    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return

def experiment_ghavamzade(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last action Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # FeaturesH
    low_hi = 0
    lim_hi = 1000 + 1e-8
    n_tiles_high = [20, 20]
    n_tilings = 1

    # Discretization Block
    discretization_block = DiscretizationBlock(low=low_hi, high=lim_hi,
                                               n_tiles=n_tiles_high)

    # PolicyH
    epsilon = Parameter(value=0.1)
    piH = EpsGreedy(epsilon=epsilon)

    # AgentH
    learning_rate = params.get('learning_rate_high')

    mdp_info_agentH = MDPInfo(
        observation_space=spaces.Discrete(n_tiles_high[0] * n_tiles_high[1]),
        action_space=spaces.Discrete(8), gamma=1, horizon=10000)

    agentH = alg_high(policy=piH, mdp_info=mdp_info_agentH,
                      learning_rate=learning_rate, lambda_coeff=0.9)

    epsilon_update = EpsilonUpdate(piH)

    # Control Block H
    control_blockH = ControlBlock(name='control block H', agent=agentH,
                                  n_steps_per_fit=1)

    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(observation_space=spaces.Box(
        low=np.array([0, 0]), high=np.array([150, 150]), shape=(2,)),
        action_space=mdp.info.action_space, gamma=0.99, horizon=10000)

    # Approximators
    input_shape = (featuresL.size,)

    approximator_params = dict(input_dim=input_shape[0])
    approximator1 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)
    approximator2 = Regressor(LinearApproximator, input_shape=input_shape,
                              output_shape=mdp.info.action_space.shape,
                              **approximator_params)

    # Policy1
    std1 = np.array([3e-2])
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=std1)

    # Policy2
    std2 = np.array([3e-2])
    pi2 = DiagonalGaussianPolicy(mu=approximator2, std=std2)

    # Agent1
    learning_rate1 = params.get('learning_rate_low')
    agent1 = alg_low(pi1, mdp_info_agentL, learning_rate1, featuresL)

    # Agent2
    learning_rate2 = params.get('learning_rate_low')
    agent2 = alg_low(pi2, mdp_info_agentL, learning_rate2, featuresL)

    # Termination Conds
    termination_condition1 = TerminationCondition(active_dir='+')
    termination_condition2 = TerminationCondition(active_dir='x')

    low_ep_per_fit = params.get('low_ep_per_fit')

    # Control Block +
    control_block_plus = ControlBlock(
        name='control block 1', agent=agent1, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition1)

    # Control Block x
    control_block_cross = ControlBlock(
        name='control block 2', agent=agent2, n_eps_per_fit=low_ep_per_fit,
        termination_condition=termination_condition2)

    # Function Block 1: picks state for hi lev ctrl
    function_block1 = fBlock(phi=pick_state, name='f1 pickstate')

    # Function Block 2: maps the env to low lev ctrl state
    function_block2 = fBlock(phi=rototranslate, name='f2 rotot')

    # Function Block 3: holds curr state as ref
    function_block3 = hold_state(name='f3 holdstate')

    # Function Block 4: adds hi lev rew
    function_block4 = addBlock(name='f4 add')

    # Function Block 5: adds low lev rew
    function_block5 = addBlock(name='f5 add')

    # Function Block 6: ext rew of hi lev ctrl
    function_block6 = fBlock(phi=G_high, name='f6 G_hi')

    # Function Block 7: ext rew of low lev ctrl
    function_block7 = fBlock(phi=G_low, name='f7 G_lo')

    # Reward Accumulator H
    reward_acc_H = reward_accumulator_block(gamma=mdp_info_agentH.gamma,
                                            name='reward_acc_H')

    # Selector Block
    function_block8 = fBlock(phi=selector_function, name='f7 G_lo')

    # Mux_Block
    mux_block = MuxBlock(name='mux')
    mux_block.add_block_list([control_block_plus])
    mux_block.add_block_list([control_block_cross])

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_blockH, mux_block,
              function_block1, function_block2, function_block3,
              function_block4, function_block5, function_block6,
              function_block7, function_block8, reward_acc_H,
              discretization_block]

    reward_acc_H.add_input(reward_ph)
    reward_acc_H.add_alarm_connection(control_block_plus)
    reward_acc_H.add_alarm_connection(control_block_cross)

    control_blockH.add_input(discretization_block)
    control_blockH.add_reward(function_block4)
    control_blockH.add_alarm_connection(control_block_plus)
    control_blockH.add_alarm_connection(control_block_cross)

    mux_block.add_input(function_block8)
    mux_block.add_input(function_block2)

    control_block_plus.add_reward(function_block5)
    control_block_cross.add_reward(function_block5)

    function_block1.add_input(state_ph)

    function_block2.add_input(control_blockH)
    function_block2.add_input(state_ph)
    function_block2.add_input(function_block3)

    function_block3.add_input(state_ph)
    function_block3.add_alarm_connection(control_block_plus)
    function_block3.add_alarm_connection(control_block_cross)

    function_block4.add_input(function_block6)
    function_block4.add_input(reward_acc_H)

    function_block5.add_input(function_block7)

    function_block6.add_input(reward_ph)

    function_block7.add_input(control_blockH)
    function_block7.add_input(function_block2)

    function_block8.add_input(control_blockH)

    discretization_block.add_input(function_block1)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval1 = list()
    low_level_dataset_eval2 = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    # print('distribution parameters: ', distribution.get_parameters())
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION', n)

        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

        dataset_plus = control_block_plus.dataset.get()
        J_plus = compute_J(dataset_plus, mdp.info.gamma)
        dataset_cross = control_block_cross.dataset.get()
        J_cross = compute_J(dataset_cross, mdp.info.gamma)

        low_level_dataset_eval1.append(dataset_plus)
        low_level_dataset_eval2.append(dataset_cross)

        print('J ll PLUS at iteration ' + str(n) + ': ' +
              str(np.mean(J_plus)))
        print('J ll CROSS at iteration ' + str(n) + ': ' +
              str(np.mean(J_cross)))

        if n == 4:
            control_blockH.callbacks = [epsilon_update]

    # Tile data
    hi_lev_params = agentH.Q.table
    max_q_val = np.zeros(n_tiles_high[0]**2)
    act_max_q_val = np.zeros(n_tiles_high[0]**2)
    for n in range(n_tiles_high[0]**2):
        max_q_val[n] = np.amax(hi_lev_params[n])
        act_max_q_val[n] = np.argmax(hi_lev_params[n])

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset1_file',
            low_level_dataset_eval1)
    np.save(subdir + str(i) + '/low_level_dataset2_file',
            low_level_dataset_eval2)
    np.save(subdir + str(i) + '/max_q_val_tiled_file', max_q_val)
    np.save(subdir + str(i) + '/act_max_q_val_tiled_file', act_max_q_val)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

    return

def server_experiment_small(alg_high, alg_low, params, subdir, i):
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([500, 500]))
    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2,)),
                              gamma=mdp.info.gamma, horizon=100)
    agent1 = alg_high(policy=pi1, mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1, features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = alg_low(distribution=distribution2, policy=pi2,
                     mdp_info=mdp_info_agent2, learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2, reward_acc]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        low_level_dataset_eval += control_block2.dataset.get()
    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file', parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file', parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)

def segway_experiment(alg_high, alg_low, params_high, params_low):
    np.random.seed()

    # Model Block
    mdp = SegwayLinearMotion(goal_distance=1.0)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (pick distance to goal state var)',
                             phi=pick_first_state)

    # Function Block 2
    function_block2 = fBlock(name='f2 (build state)',
                             phi=angle_to_angle_diff_complete_state)

    # Function Block 3
    function_block3 = fBlock(name='f3 (reward low level)',
                             phi=lqr_cost_segway)

    # Function Block 4
    function_block4 = addBlock(name='f4 (add block)')

    # Function Block 5
    function_block5 = fBlock(name='f5 (fall punish low level)',
                             phi=fall_reward)

    # Features
    approximator1 = Regressor(LinearApproximator, input_shape=(1,),
                              output_shape=(1,))

    # Policy H
    n_weights = approximator1.weights_size
    mu1 = np.zeros(n_weights)
    sigma1 = 2.0e-2 * np.ones(n_weights)
    pi1 = DeterministicPolicy(approximator1)
    dist1 = GaussianDiagonalDistribution(mu1, sigma1)

    # Agent H
    lim = np.pi / 2
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(-lim, lim, (1,)),
                              gamma=mdp.info.gamma,
                              horizon=mdp.info.horizon)
    agent_high = alg_high(dist1, pi1, mdp_info_agent1, **params_high)

    # Policy L
    approximator2 = Regressor(LinearApproximator, input_shape=(3,),
                              output_shape=(1,))
    n_weights2 = approximator2.weights_size
    mu2 = np.zeros(n_weights2)
    sigma2 = 2.0 * np.ones(n_weights2)
    pi2 = DeterministicControlPolicy(approximator2)
    dist2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent Low
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        low=mdp.info.observation_space.low[1:],    # FIXME FALSE
        high=mdp.info.observation_space.high[1:],  # FIXME FALSE
        shape=(3,)),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=mdp.info.horizon)
    agent_low = alg_low(dist2, pi2, mdp_info_agent2, **params_low)

    # Control Block 1
    parameter_callback1 = CollectDistributionParameter(dist1)
    control_block1 = ControlBlock(name='Control Block High',
                                  agent=agent_high,
                                  n_eps_per_fit=n_ep_per_fit * 2,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(dist2)
    control_block2 = ControlBlock(name='Control Block Low', agent=agent_low,
                                  n_eps_per_fit=n_ep_per_fit,
                                  callbacks=[parameter_callback2])

    control_block1.set_mask()

    # Algorithm
    blocks = [state_ph, reward_ph, lastaction_ph, control_block1,
              control_block2, function_block1, function_block2,
              function_block3, function_block4, function_block5]

    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(function_block1)
    control_block1.add_reward(reward_ph)
    control_block2.add_input(function_block2)
    control_block2.add_reward(function_block4)
    function_block1.add_input(state_ph)
    function_block2.add_input(control_block1)
    function_block2.add_input(state_ph)
    function_block3.add_input(function_block2)
    function_block5.add_input(state_ph)
    function_block4.add_input(function_block3)
    function_block4.add_input(function_block5)

    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    mask_done = False
    for n in range(n_epochs):
        print('ITERATION', n)
        if n == 2:
            control_block1.unset_mask()
        core.learn(n_episodes=n_iterations * n_ep_per_fit, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run, render=False)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        print('dist H:', dist1.get_parameters())
        print('dist L mu:', dist2.get_parameters()[:3])
        print('dist L sigma:', dist2.get_parameters()[3:])