#w1 = fb._w1(x)
#print("w1: ", w1)
#w2 = fb._w2(x)
#print("w2: ", w2)

#mu = fb.feedback(x, v)
#nu = fb.noisy_feedback(x, v)
#print("mean u = ", mu)
#print("noisy u = ", nu)

#lp = fb.log_prob(u, x, v)
#print("log prob of [1, 1] is ", lp)

# Generate v, y desired and check that we can match with no model mismatch.
from reinforce import Reinforce

r = Reinforce(1, 1, 1, 1, 100, dyn, None, fb)

current_x = np.array([[0.1], [0.0], [0.1], [0.0]])
vs = r._generate_v()
y_desireds = r._generate_y(current_x, vs)

ys = []
print("current x: ", current_x)
for v, y_desired in zip(vs, y_desireds):
    u = dyn._f_q(current_x) + dyn._M_q(current_x) @ v
    current_x = dyn.integrate(current_x, u, dt=0.001)
    ys.append(dyn.observation(current_x))

print("ys: ", ys)
print("y_desireds:", y_desireds)
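# A minimal sketch (not part of the original script) of how one might quantify
# the match checked above. It assumes ys and y_desireds are equal-length lists
# of numpy column vectors of the same shape; with no model mismatch the
# per-step Euclidean tracking error should stay near zero.
def tracking_errors(actual, desired):
    # Per-step Euclidean distance between actual and desired outputs.
    return [float(np.linalg.norm(a - d)) for a, d in zip(actual, desired)]

errors = tracking_errors(ys, y_desireds)
print("max tracking error: ", max(errors))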
class DoublePendulumEnv(gym.Env):
    def __init__(self, stepsPerRollout, rewardScaling, dynamicsScaling,
                 preprocessState, uscaling, largerQ):
        # Call the init method of the parent class.
        super(DoublePendulumEnv, self).__init__()

        # Set local parameters.
        self._preprocess_state = preprocessState
        self._num_steps_per_rollout = stepsPerRollout
        self._reward_scaling = rewardScaling
        self._norm = 2
        self._time_step = 0.01
        self._uscaling = uscaling
        self._largerQ = largerQ

        # Set action space dimensions so the agent knows the output size.
        self.action_space = spaces.Box(
            low=-50, high=50, shape=(6, ), dtype=np.float32)

        # Set observation space dimensions so the agent knows the input size.
        if self._preprocess_state:
            self.observation_space = spaces.Box(
                low=-100, high=100, shape=(6, ), dtype=np.float32)
        else:
            self.observation_space = spaces.Box(
                low=-100, high=100, shape=(4, ), dtype=np.float32)

        # TODO: these should match what we get from system identification, once
        # we have reliable numbers there.
        # Set parameters of the double pendulum and create the dynamics object.
        mass1 = 1.0
        mass2 = 1.0
        length1 = 1.0
        length2 = 1.0
        self._dynamics = DoublePendulum(
            mass1, mass2, length1, length2, self._time_step)

        # Create the mismatched ("bad") dynamics object.
        scaling = 0.000001 + dynamicsScaling
        self._bad_dynamics = DoublePendulum(
            scaling * mass1, scaling * mass2,
            scaling * length1, scaling * length2, self._time_step)

        # Set other local variables.
        self.A, self.B, C = self._dynamics.linearized_system()
        self._count = 0
        self._xdim = self._dynamics.xdim
        self._udim = self._dynamics.udim
        self._M1, self._f1 = self._bad_dynamics.feedback_linearize()
        self._iter_count = 0

    def step(self, u):
        # Compute v based on the basic LQR control law.
        diff = self._dynamics.linear_system_state_delta(
            self._reference[self._count], self._current_y)
        v = -self._K @ diff

        # Apply the neural network output as additive corrections to the bad
        # model's decoupling matrix M and drift term f.
        m2, f2 = np.split(self._uscaling * u, [4])
        M = self._bad_dynamics._M_q(self._state) + np.reshape(
            m2, (self._udim, self._udim))
        f = self._bad_dynamics._f_q(self._state) + np.reshape(
            f2, (self._udim, 1))

        z = np.dot(M, v) + f
        self._state = self._dynamics.integrate(self._state, z, self._time_step)
        self._current_y = self._dynamics.linearized_system_state(self._state)

        reward = self.computeReward(self._y_desired[self._count],
                                    self._current_y)

        # Increase count.
        self._count += 1

        # The episode terminates once the rollout length is reached.
        done = False
        if self._count >= self._num_steps_per_rollout:
            done = True

        # Format the observation as a flat list.
        observation = [x[0] for x in self._state]

        # Preprocess the observation if requested.
        if self._preprocess_state:
            observation = self.preprocess_state(observation)

        return np.array(observation), reward, done, {}

    def reset(self):
        # (0) Sample an initial state using the state sampler method.
        self._state = self.initial_state_sampler()

        # (1) Generate a time series for v and the corresponding y.
        self._reference, self._K = self._generate_reference(self._state)
        self._y_desired = self._generate_ys(self._state, self._reference,
                                            self._K)
        self._current_y = self._dynamics.linearized_system_state(self._state)

        # Reset internal count.
        self._count = 0

        # Format the observation as a flat list.
        observation = [x[0] for x in self._state]

        # Preprocess the observation if requested.
        if self._preprocess_state:
            observation = self.preprocess_state(observation)

        return np.array(observation)

    def seed(self, s):
        np.random.seed(s)

    def render(self):
        # TODO!
        pass

    def initial_state_sampler(self):
        lower = np.array([[-np.pi], [-0.5], [-np.pi], [-0.5]])
        upper = -lower
        return np.random.uniform(lower, upper)

    def _generate_reference(self, x0):
        """
        Use a sinusoid with random frequency, amplitude, and bias:
        ```
        vi(k) = a * sin(2 * pi * f * k) + b
        ```
        """
        MAX_CONTINUOUS_TIME_FREQ = 0.1
        MAX_DISCRETE_TIME_FREQ = MAX_CONTINUOUS_TIME_FREQ * \
            self._dynamics._time_step

        linsys_xdim = self.A.shape[0]
        linsys_udim = self.B.shape[1]

        # Randomly scaled Q if requested, otherwise a fixed scaling.
        if self._largerQ:
            Q = 50 * (np.random.uniform() + 0.1) * np.eye(linsys_xdim)
        else:
            Q = 10 * np.eye(linsys_xdim)

        # Fixed Q scaling:
        # Q = 1.0 * np.diag([1.0, 0.0, 0.0, 0.0,
        #                    1.0, 0.0, 0.0, 0.0,
        #                    1.0, 0.0, 0.0, 0.0,
        #                    1.0, 0.0])

        # Fixed R scaling.
        R = 1.0 * np.eye(linsys_udim)

        # Initial y.
        y0 = self._dynamics.linearized_system_state(x0)

        y = np.empty((linsys_xdim, self._num_steps_per_rollout))
        for ii in range(linsys_xdim):
            y[ii, :] = np.linspace(
                0, self._num_steps_per_rollout * self._dynamics._time_step,
                self._num_steps_per_rollout)
            y[ii, :] = y0[ii, 0] + 1.0 * np.random.uniform() * (1.0 - np.cos(
                2.0 * np.pi * MAX_DISCRETE_TIME_FREQ * \
                np.random.uniform() * y[ii, :])) #+ 0.1 * np.random.normal()

        # Ensure that the y reference starts at y0.
        assert np.allclose(y[:, 0].flatten(), y0.flatten(), 1e-5)

        # Solve the continuous-time algebraic Riccati equation for the LQR gain.
        P = solve_continuous_are(self.A, self.B, Q, R)
        K = np.linalg.inv(R) @ self.B.T @ P

        return (np.split(y, indices_or_sections=self._num_steps_per_rollout,
                         axis=1), K)

    def _generate_ys(self, x0, refs, K):
        """
        Compute the desired output sequence given the initial state and the
        reference sequence. This is computed by applying the true dynamics'
        feedback linearization.
        """
        x = x0.copy()
        y = self._dynamics.linearized_system_state(x)
        ys = []
        for r in refs:
            diff = self._dynamics.linear_system_state_delta(r, y)
            v = -K @ diff
            u = self._dynamics.feedback(x, v)
            x = self._dynamics.integrate(x, u)
            y = self._dynamics.linearized_system_state(x)
            ys.append(y.copy())

        return ys

    def computeReward(self, y_desired, y):
        return -self._reward_scaling * self._dynamics.observation_distance(
            y_desired, y, self._norm)

    def close(self):
        pass

    def preprocess_state(self, x):
        return self._dynamics.preprocess_state(x)
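# A self-contained sketch of the LQR gain computation used in
# _generate_reference, shown on a 1-D double integrator with illustrative
# weights (these matrices are assumptions for the example, not the pendulum's
# linearized system): solve the continuous-time algebraic Riccati equation and
# form K = R^{-1} B^T P, so that v = -K (y - y_ref) stabilizes the error.
import numpy as np
from scipy.linalg import solve_continuous_are

A_demo = np.array([[0.0, 1.0], [0.0, 0.0]])   # double integrator (example only)
B_demo = np.array([[0.0], [1.0]])
Q_demo = 10.0 * np.eye(2)
R_demo = 1.0 * np.eye(1)

P_demo = solve_continuous_are(A_demo, B_demo, Q_demo, R_demo)
K_demo = np.linalg.inv(R_demo) @ B_demo.T @ P_demo
print("LQR gain K = ", K_demo)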
    obs = np.array(dyn.preprocess_state(x))
    # obs = x

    # Query the trained policy network for the additive M and f corrections.
    u = sess.run(
        tf.get_default_graph().get_tensor_by_name('pi/add:0'),
        {tf.get_default_graph().get_tensor_by_name('Placeholder:0'):
         obs.reshape(1, -1)})

    # Output of the neural network.
    u = u[0]
    m2, f2 = np.split(u, [4])
    M = bad_dyn._M_q(x) + np.reshape(m2, (2, 2))
    f = bad_dyn._f_q(x) + np.reshape(f2, (2, 1))
    control = np.matmul(M, v) + f

    learned_controls_path[:, t] = control[:, 0]
    x = dyn.integrate(x, control)
    learned_path[:, t + 1] = diff[:, 0]

    if check_energy:
        print(dyn.total_energy(x))
    if to_render and t % 2 == 0:
        dyn.render(x, speed, path, float(t + 0.2 * T) / (1.5 * T))

if nominal:
    x = x0.copy()
    dyn.fig = None
    for t in range(T):
        y = dyn.observation(x)
        ydot = dyn.observation_dot(x)
import torch
import numpy as np

from double_pendulum import DoublePendulum

dyn = DoublePendulum(1.0, 1.0, 1.0, 1.0, time_step=0.01)

u = np.array([[0.0], [0.0]])
x = np.array([[0.1], [0.0], [0.1], [0.0]])

current_x = x.copy()
for ii in range(10):
    current_x = dyn.integrate(current_x, u)
    print(current_x.T)
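# A minimal sketch (assuming DoublePendulum.total_energy is available, as used
# in the rendering script above, and assuming the model is undamped) of a
# sanity check on the integrator: with zero control the total energy should
# stay roughly constant, so large drift over a long rollout points at
# integration error rather than dynamics.
e0 = dyn.total_energy(x)
current_x = x.copy()
for ii in range(1000):
    current_x = dyn.integrate(current_x, u)
print("energy drift after 1000 steps: ", dyn.total_energy(current_x) - e0)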