def __init__(self, simulator, dt=0.5, Lambda=0.75, alpha_v=0.1, alpha_u=0.1,
             num_features=2**20, tile_weight_exponent=0.5, trunc_normal=True,
             subspaces=None):
    """Build the hashed tile coder, the critic and the thrust/RCS actors.

    Args:
        simulator: Object providing ``dt``, ``max_thrust`` and ``max_rcs``.
        dt: Requested agent time step. The stored ``self.dt`` is never
            smaller than ``simulator.dt``.
        Lambda: Forwarded unchanged to the critic and to both actors
            (presumably the eligibility-trace decay -- confirm against
            ``Critic`` / ``PolicyGradientActor``).
        alpha_v: Forwarded to the critic (presumably its step size).
        alpha_u: Forwarded to both actors (presumably their step size).
        num_features: Size handed to ``HashingTileCoder``.
        tile_weight_exponent: Forwarded to ``make_tile_coder``.
        trunc_normal: Forwarded to both actors.
        subspaces: Forwarded to ``make_tile_coder``. ``None`` (the default)
            selects ``[1, 2, 6]``.
    """
    # A list literal in the signature would be one object shared by every
    # call; build a fresh default per call instead.
    if subspaces is None:
        subspaces = [1, 2, 6]

    self.simulator = simulator
    self.dt = max(dt, self.simulator.dt)

    base_coder = self.make_tile_coder(tile_weight_exponent, subspaces)
    self.tile_coder = HashingTileCoder(base_coder, num_features)

    # Initial policy parameters.  Note the thrust mean is the absolute
    # value 0.5, while both sigmas are fractions of the actuator limits.
    initial_thrust_sigma = simulator.max_thrust / 10
    initial_thrust_mu = 0.5
    initial_rcs_sigma = simulator.max_rcs / 6
    initial_rcs_mu = 0.0

    self.critic = Critic(self.tile_coder, Lambda, alpha_v, initial_value=1.0)

    # Thrust is one-sided: [0, max_thrust].
    self.thrust_actor = PolicyGradientActor(
        self.tile_coder, Lambda, alpha_u,
        min_action=0.0,
        max_action=simulator.max_thrust,
        min_sigma=simulator.max_thrust / 64,
        max_sigma=simulator.max_thrust / 2,
        initial_mu=initial_thrust_mu,
        initial_sigma=initial_thrust_sigma,
        trunc_normal=trunc_normal)

    # RCS is symmetric: [-max_rcs, max_rcs].
    self.rcs_actor = PolicyGradientActor(
        self.tile_coder, Lambda, alpha_u,
        min_action=-simulator.max_rcs,
        max_action=simulator.max_rcs,
        min_sigma=simulator.max_rcs / 32,
        max_sigma=simulator.max_rcs,
        initial_mu=initial_rcs_mu,
        initial_sigma=initial_rcs_sigma,
        trunc_normal=trunc_normal)
def __init__(self, simulator, dt=0.5, Lambda=0.75, alpha_v=0.1, alpha_u=0.1,
             num_features=2**20, tile_weight_exponent=0.5, trunc_normal=True,
             subspaces=None):
    """Set up the tile coder, one critic and two policy-gradient actors.

    Args:
        simulator: Object providing ``dt``, ``max_thrust`` and ``max_rcs``.
        dt: Requested agent time step; ``self.dt`` is clamped from below
            to ``simulator.dt``.
        Lambda: Passed through to the critic and both actors (presumably
            the trace-decay parameter -- confirm in those classes).
        alpha_v: Passed through to the critic.
        alpha_u: Passed through to both actors.
        num_features: Passed through to ``HashingTileCoder``.
        tile_weight_exponent: Passed through to ``make_tile_coder``.
        trunc_normal: Passed through to both actors.
        subspaces: Passed through to ``make_tile_coder``; defaults to
            ``[1, 2, 6]`` when ``None``.
    """
    # Avoid a mutable default argument: create the default list per call.
    if subspaces is None:
        subspaces = [1, 2, 6]

    self.simulator = simulator
    self.dt = max(dt, self.simulator.dt)
    self.tile_coder = HashingTileCoder(
        self.make_tile_coder(tile_weight_exponent, subspaces), num_features)

    max_thrust = simulator.max_thrust
    max_rcs = simulator.max_rcs

    # Starting mean / spread of each action distribution.
    initial_thrust_sigma = max_thrust / 10
    initial_thrust_mu = 0.5
    initial_rcs_sigma = max_rcs / 6
    initial_rcs_mu = 0.0

    self.critic = Critic(self.tile_coder, Lambda, alpha_v, initial_value=1.0)

    # Main engine: actions in [0, max_thrust].
    self.thrust_actor = PolicyGradientActor(
        self.tile_coder, Lambda, alpha_u,
        min_action=0.0,
        max_action=max_thrust,
        min_sigma=max_thrust / 64,
        max_sigma=max_thrust / 2,
        initial_mu=initial_thrust_mu,
        initial_sigma=initial_thrust_sigma,
        trunc_normal=trunc_normal)

    # Reaction control: actions in [-max_rcs, max_rcs].
    self.rcs_actor = PolicyGradientActor(
        self.tile_coder, Lambda, alpha_u,
        min_action=-max_rcs,
        max_action=max_rcs,
        min_sigma=max_rcs / 32,
        max_sigma=max_rcs,
        initial_mu=initial_rcs_mu,
        initial_sigma=initial_rcs_sigma,
        trunc_normal=trunc_normal)
class PolicyGradientAgent:
    """Actor-critic agent: one critic plus a thrust actor and an RCS actor.

    All three learners are built on the same ``HashingTileCoder``.  The
    learned parameters are five weight vectors (critic value, thrust
    mu/sigma, RCS mu/sigma) that can be exported, restored, saved to disk
    or re-homed onto shared / memory-mapped storage.
    """

    def __init__(self, simulator, dt=0.5, Lambda=0.75, alpha_v=0.1,
                 alpha_u=0.1, num_features=2**20, tile_weight_exponent=0.5,
                 trunc_normal=True, subspaces=None):
        """Build the tile coder, the critic and both actors.

        Args:
            simulator: Object providing ``dt``, ``max_thrust``, ``max_rcs``.
            dt: Requested agent time step; ``self.dt`` is never smaller
                than ``simulator.dt``.
            Lambda: Forwarded to the critic and both actors (presumably
                the eligibility-trace decay -- confirm in those classes).
            alpha_v: Forwarded to the critic.
            alpha_u: Forwarded to both actors.
            num_features: Size handed to ``HashingTileCoder``.
            tile_weight_exponent: Forwarded to ``make_tile_coder``.
            trunc_normal: Forwarded to both actors.
            subspaces: Forwarded to ``make_tile_coder``; ``None`` (the
                default) selects ``[1, 2, 6]``.
        """
        # A list literal in the signature would be shared across calls.
        if subspaces is None:
            subspaces = [1, 2, 6]

        self.simulator = simulator
        self.dt = max(dt, self.simulator.dt)
        self.tile_coder = HashingTileCoder(
            self.make_tile_coder(tile_weight_exponent, subspaces),
            num_features)

        # Thrust mean is the absolute value 0.5; sigmas are fractions of
        # the actuator limits.
        initial_thrust_sigma = simulator.max_thrust / 10
        initial_thrust_mu = 0.5
        initial_rcs_sigma = simulator.max_rcs / 6
        initial_rcs_mu = 0.0

        self.critic = Critic(self.tile_coder, Lambda, alpha_v,
                             initial_value=1.0)
        self.thrust_actor = PolicyGradientActor(
            self.tile_coder, Lambda, alpha_u,
            min_action=0.0,
            max_action=simulator.max_thrust,
            min_sigma=simulator.max_thrust / 64,
            max_sigma=simulator.max_thrust / 2,
            initial_mu=initial_thrust_mu,
            initial_sigma=initial_thrust_sigma,
            trunc_normal=trunc_normal)
        self.rcs_actor = PolicyGradientActor(
            self.tile_coder, Lambda, alpha_u,
            min_action=-simulator.max_rcs,
            max_action=simulator.max_rcs,
            min_sigma=simulator.max_rcs / 32,
            max_sigma=simulator.max_rcs,
            initial_mu=initial_rcs_mu,
            initial_sigma=initial_rcs_sigma,
            trunc_normal=trunc_normal)

    def make_tile_coder(self, tile_weight_exponent, subspaces):
        """Create the base ``TileCoder`` and record the state limits.

        Side effects: sets ``self.min_state`` / ``self.max_state`` (the
        range covered by the tiling) and ``self.min_clip_state`` /
        ``self.max_clip_state`` (the limits later used to clip incoming
        states; infinite for dimensions flagged as unbounded).

        Returns:
            The ``TileCoder`` built from the per-dimension tables below.
        """
        # Per-dimension tables, in this column order:
        #   xpos, ypos, xvel, yvel, rot, rotvel
        state_signed = np.array([False, False, True, True, True, True])
        state_bounded = np.array([True, True, True, True, False, True])
        tile_size = np.array([5., 5., 2., 2., math.pi / 2, math.pi / 6])
        num_tiles = np.array([6, 4, 4, 4, 2, 3])
        num_offsets = np.array([2, 2, 4, 4, 8, 4])

        unsigned = np.logical_not(state_signed)
        unbounded = np.logical_not(state_bounded)

        # The small epsilon presumably keeps the upper limit strictly
        # inside the last tile -- confirm against TileCoder.
        self.max_state = (tile_size * num_tiles) - 1e-8
        self.min_state = -self.max_state
        self.min_state[unsigned] = 0.0

        # Clip limits equal the state limits, except that unbounded
        # dimensions are not clipped at all.
        self.max_clip_state = self.max_state.copy()
        self.max_clip_state[unbounded] = float('inf')
        self.min_clip_state = -self.max_clip_state
        self.min_clip_state[unsigned] = 0.0

        # Signed dimensions get twice the tiles (negative half as well);
        # bounded dimensions get one extra tile.  Order matters here.
        num_tiles[state_signed] *= 2
        num_tiles[state_bounded] += 1

        return TileCoder(tile_size, num_tiles, num_offsets, subspaces,
                         tile_weight_exponent)

    def _features(self, state):
        """Clip ``state`` to the clip limits and return its tile indices."""
        clipped = state.clip(self.min_clip_state, self.max_clip_state)
        return self.tile_coder.indices(clipped)

    def compute_action(self, features):
        """Return ``(thrust, rcs)`` as produced by the two actors.

        The actors' outputs are returned as-is; an earlier reflective
        clamp into the actuator ranges was disabled and is not applied.
        """
        thrust = self.thrust_actor.act(features)
        rcs = self.rcs_actor.act(features)
        return (thrust, rcs)

    def initialize(self, state):
        """Start an episode from ``state`` and return the first action."""
        features = self._features(state)
        self.critic.initialize(features)
        self.thrust_actor.initialize()
        self.rcs_actor.initialize()
        return self.compute_action(features)

    def update(self, state, reward, terminal=False, learn=True):
        """Optionally learn from ``reward`` and return the next action.

        Args:
            state: New state array; clipped before tile coding.
            reward: Reward passed to the critic.
            terminal: Passed to the critic's ``evaluate``.
            learn: When false, no critic/actor update is performed and
                only an action is computed.
        """
        features = self._features(state)
        if learn:
            td_error = self.critic.evaluate(features, reward, terminal)
            self.thrust_actor.learn(td_error)
            self.rcs_actor.learn(td_error)
        return self.compute_action(features)

    def get_state(self):
        """Return a stacked copy of the five weight vectors (5 rows).

        Row order: critic value, thrust mu, thrust sigma, RCS mu,
        RCS sigma.
        """
        return np.vstack((self.critic.value.weights,
                          self.thrust_actor.mu.weights,
                          self.thrust_actor.sigma.weights,
                          self.rcs_actor.mu.weights,
                          self.rcs_actor.sigma.weights))

    def set_state(self, state):
        """Install ``state`` as the backing storage of all five weights.

        ``state`` is reshaped *in place* to ``(5, num_features)`` (so no
        copy is ever made, and incompatible arrays raise).  Each weight
        vector then becomes a row view of ``state``; later weight updates
        write through to it.
        """
        state.shape = (5, self.tile_coder.num_features)
        (self.critic.value.weights,
         self.thrust_actor.mu.weights,
         self.thrust_actor.sigma.weights,
         self.rcs_actor.mu.weights,
         self.rcs_actor.sigma.weights) = state

    def save_state(self, savefile='data/saved_state.npy'):
        """Write the current weights to ``savefile`` with ``np.save``."""
        np.save(savefile, self.get_state())

    def load_state(self, savefile='data/saved_state.npy', mmap_mode=None):
        """Load weights from ``savefile`` and install them.

        Args:
            savefile: Path of a file written by ``save_state``.
            mmap_mode: Forwarded to ``np.load``; when set, the weights
                stay backed by the memory-mapped file.
        """
        # asarray never copies an existing ndarray / memmap, matching the
        # intent of the former ``np.array(..., copy=False)``.
        state = np.asarray(np.load(savefile, mmap_mode=mmap_mode))
        self.set_state(state)

    def persist_state(self, savefile=None, readonly=False):
        """Move the weights onto shared or file-backed storage.

        Args:
            savefile: ``None`` to place the weights in a
                ``multiprocessing`` ``RawArray``; otherwise the path of
                an ``.npy`` file to memory-map.
            readonly: Only used with ``savefile``.  When true the file is
                not rewritten first (so it must already exist) and is
                mapped with mode ``'r'``; otherwise the current weights
                are saved and the file is mapped with mode ``'r+'``.
        """
        if savefile is None:
            shared = mp.RawArray(ctypes.c_double,
                                 5 * self.tile_coder.num_features)
            state = np.frombuffer(shared)
            state[:] = self.get_state().flat
            self.set_state(state)
        else:
            if not readonly:
                self.save_state(savefile)
            self.load_state(savefile, mmap_mode='r' if readonly else 'r+')
class PolicyGradientAgent:
    """Actor-critic agent with a critic, a thrust actor and an RCS actor.

    The three learners share one ``HashingTileCoder``.  Their five weight
    vectors (critic value, thrust mu/sigma, RCS mu/sigma) can be exported,
    restored, written to disk, or moved onto shared / memory-mapped
    storage.
    """

    def __init__(self, simulator, dt=0.5, Lambda=0.75, alpha_v=0.1,
                 alpha_u=0.1, num_features=2**20, tile_weight_exponent=0.5,
                 trunc_normal=True, subspaces=None):
        """Construct the tile coder, the critic and both actors.

        Args:
            simulator: Object providing ``dt``, ``max_thrust``, ``max_rcs``.
            dt: Requested agent time step; ``self.dt`` is clamped from
                below to ``simulator.dt``.
            Lambda: Passed through to the critic and both actors
                (presumably the trace-decay parameter -- confirm there).
            alpha_v: Passed through to the critic.
            alpha_u: Passed through to both actors.
            num_features: Passed through to ``HashingTileCoder``.
            tile_weight_exponent: Passed through to ``make_tile_coder``.
            trunc_normal: Passed through to both actors.
            subspaces: Passed through to ``make_tile_coder``; defaults to
                ``[1, 2, 6]`` when ``None``.
        """
        # Build the default per call rather than sharing one list object.
        if subspaces is None:
            subspaces = [1, 2, 6]

        self.simulator = simulator
        self.dt = max(dt, self.simulator.dt)

        base_coder = self.make_tile_coder(tile_weight_exponent, subspaces)
        self.tile_coder = HashingTileCoder(base_coder, num_features)

        max_thrust = simulator.max_thrust
        max_rcs = simulator.max_rcs

        # Starting mean / spread of each action distribution.
        initial_thrust_sigma = max_thrust / 10
        initial_thrust_mu = 0.5
        initial_rcs_sigma = max_rcs / 6
        initial_rcs_mu = 0.0

        self.critic = Critic(self.tile_coder, Lambda, alpha_v,
                             initial_value=1.0)

        # Main engine: actions in [0, max_thrust].
        self.thrust_actor = PolicyGradientActor(
            self.tile_coder, Lambda, alpha_u,
            min_action=0.0,
            max_action=max_thrust,
            min_sigma=max_thrust / 64,
            max_sigma=max_thrust / 2,
            initial_mu=initial_thrust_mu,
            initial_sigma=initial_thrust_sigma,
            trunc_normal=trunc_normal)

        # Reaction control: actions in [-max_rcs, max_rcs].
        self.rcs_actor = PolicyGradientActor(
            self.tile_coder, Lambda, alpha_u,
            min_action=-max_rcs,
            max_action=max_rcs,
            min_sigma=max_rcs / 32,
            max_sigma=max_rcs,
            initial_mu=initial_rcs_mu,
            initial_sigma=initial_rcs_sigma,
            trunc_normal=trunc_normal)

    def make_tile_coder(self, tile_weight_exponent, subspaces):
        """Build the base ``TileCoder`` and store the state limits.

        Side effects: assigns ``self.min_state`` / ``self.max_state`` and
        ``self.min_clip_state`` / ``self.max_clip_state``.  The clip
        limits are infinite for dimensions flagged as unbounded.

        Returns:
            A ``TileCoder`` configured from the tables below.
        """
        # Column order of every table:
        #   xpos, ypos, xvel, yvel, rot, rotvel
        state_signed = np.array([False, False, True, True, True, True])
        state_bounded = np.array([True, True, True, True, False, True])
        tile_size = np.array([5., 5., 2., 2., math.pi / 2, math.pi / 6])
        num_tiles = np.array([6, 4, 4, 4, 2, 3])
        num_offsets = np.array([2, 2, 4, 4, 8, 4])

        not_signed = np.logical_not(state_signed)
        not_bounded = np.logical_not(state_bounded)

        # Epsilon presumably keeps the top of the range inside the last
        # tile -- confirm against TileCoder.
        self.max_state = (tile_size * num_tiles) - 1e-8
        self.min_state = -self.max_state
        self.min_state[not_signed] = 0.0

        # Same limits for clipping, but unbounded dimensions pass through.
        self.max_clip_state = self.max_state.copy()
        self.max_clip_state[not_bounded] = float('inf')
        self.min_clip_state = -self.max_clip_state
        self.min_clip_state[not_signed] = 0.0

        # Double first (to cover the negative half), then add the extra
        # tile for bounded dimensions; the order of these two matters.
        num_tiles[state_signed] *= 2
        num_tiles[state_bounded] += 1

        return TileCoder(tile_size, num_tiles, num_offsets, subspaces,
                         tile_weight_exponent)

    def _features(self, state):
        """Return tile indices for ``state`` after clipping it."""
        clipped = state.clip(self.min_clip_state, self.max_clip_state)
        return self.tile_coder.indices(clipped)

    def compute_action(self, features):
        """Ask both actors for an action and return ``(thrust, rcs)``.

        No additional clamping is applied here; a previously present
        reflective clamp into the actuator ranges was disabled.
        """
        thrust = self.thrust_actor.act(features)
        rcs = self.rcs_actor.act(features)
        return (thrust, rcs)

    def initialize(self, state):
        """Reset the learners for a new episode; return the first action."""
        features = self._features(state)
        self.critic.initialize(features)
        self.thrust_actor.initialize()
        self.rcs_actor.initialize()
        return self.compute_action(features)

    def update(self, state, reward, terminal=False, learn=True):
        """Process one transition and return the next ``(thrust, rcs)``.

        Args:
            state: New state array; clipped before tile coding.
            reward: Reward handed to the critic.
            terminal: Handed to the critic's ``evaluate``.
            learn: If false, the critic and actors are left untouched.
        """
        features = self._features(state)
        if learn:
            td_error = self.critic.evaluate(features, reward, terminal)
            self.thrust_actor.learn(td_error)
            self.rcs_actor.learn(td_error)
        return self.compute_action(features)

    def get_state(self):
        """Return the five weight vectors stacked into a new 5-row array.

        Rows: critic value, thrust mu, thrust sigma, RCS mu, RCS sigma.
        """
        return np.vstack((self.critic.value.weights,
                          self.thrust_actor.mu.weights,
                          self.thrust_actor.sigma.weights,
                          self.rcs_actor.mu.weights,
                          self.rcs_actor.sigma.weights))

    def set_state(self, state):
        """Make ``state`` the storage behind all five weight vectors.

        The array's shape is set in place to ``(5, num_features)``, which
        never copies and raises for incompatible arrays.  The weights are
        then row views of ``state``, so updates write through to it.
        """
        state.shape = (5, self.tile_coder.num_features)
        (self.critic.value.weights,
         self.thrust_actor.mu.weights,
         self.thrust_actor.sigma.weights,
         self.rcs_actor.mu.weights,
         self.rcs_actor.sigma.weights) = state

    def save_state(self, savefile='data/saved_state.npy'):
        """Save the current weights to ``savefile`` via ``np.save``."""
        np.save(savefile, self.get_state())

    def load_state(self, savefile='data/saved_state.npy', mmap_mode=None):
        """Read weights from ``savefile`` and install them.

        Args:
            savefile: Path of a file produced by ``save_state``.
            mmap_mode: Forwarded to ``np.load``; when given, the weights
                remain backed by the memory-mapped file.
        """
        # asarray returns the loaded array without copying, as the former
        # ``np.array(..., copy=False)`` did.
        state = np.asarray(np.load(savefile, mmap_mode=mmap_mode))
        self.set_state(state)

    def persist_state(self, savefile=None, readonly=False):
        """Re-home the weights onto shared memory or a memory-mapped file.

        Args:
            savefile: ``None`` to use a ``multiprocessing`` ``RawArray``;
                otherwise the ``.npy`` path to memory-map.
            readonly: Only relevant with ``savefile``.  True maps the
                existing file with mode ``'r'`` without rewriting it;
                false saves the current weights first and maps ``'r+'``.
        """
        if savefile is None:
            shared = mp.RawArray(ctypes.c_double,
                                 5 * self.tile_coder.num_features)
            state = np.frombuffer(shared)
            state[:] = self.get_state().flat
            self.set_state(state)
        else:
            if not readonly:
                self.save_state(savefile)
            self.load_state(savefile, mmap_mode='r' if readonly else 'r+')