def __init__(self, hist_duration, mdp_step, time_step, action_size, batch_size,
             mean, std, hdg0, src_file, sim_time):
    self.mdp = MDP(hist_duration, mdp_step, time_step)
    self.action_size = action_size
    self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
    self.agent.load(src_file)
    self.wh = wind(mean, std, int(mdp_step / time_step))
    self.hdg0 = hdg0
    self.src = src_file
    self.sim_time = sim_time
def playActor(self):
    self.load("NetworkParam/FinalParam")
    hdg0_rand_vec = [0, 7, 12]

    '''
    WIND CONDITIONS
    '''
    mean = 45 * TORAD
    std = 0.1 * TORAD
    wind_samples = 10
    w = wind(mean=mean, std=std, samples=wind_samples)

    try:
        for i in range(len(hdg0_rand_vec)):
            # Initial state
            WH = w.generateWind()
            hdg0_rand = hdg0_rand_vec[i]
            hdg0 = hdg0_rand * TORAD * np.ones(10)
            s = self.env.reset(hdg0, WH)

            episode_reward = 0
            episode_step = 0
            v_episode = []
            i_episode = []

            while episode_step < 40:  # not done:
                if episode_step == 0:
                    i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                else:
                    i_episode.append(s[0][-1] / TORAD)

                s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
                # Deterministic action from the actor network, clipped to the action bounds
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})
                a = np.clip(a, self.low_bound, self.high_bound)

                s_, r = self.env.act(a, WH)
                episode_reward += r
                v_episode.append(r)
                episode_step += 1
                s = s_

            DISPLAYER.displayVI(v_episode, i_episode, i)
            print("Episode reward :", episode_reward, " for incidence: ", hdg0_rand)

    except KeyboardInterrupt as e:
        pass
    except Exception as e:
        print("Exception :", e)
    finally:
        print("End of the demo")
import numpy as np
from math import *

from policyLearning import PolicyLearner
from Simulator import TORAD
from mdp import MDP
from environment import wind

'''
MDP Parameters
'''
mdp = MDP(duration_history=6, duration_simulation=1, delta_t=0.1)

'''
WIND CONDITIONS
'''
w = wind(mean=45 * TORAD, std=0 * TORAD, samples=10)
WH = w.generateWind()

hdg0 = 0 * np.ones(10)
mdp.initializeMDP(hdg0, WH)

agent = PolicyLearner(mdp.size, action_size=2, batch_size=32)
# agent.load("../Networks/epsilon_pi")

EPISODES = 1
count_luff = 0
count_bear_off = 0
loss_of_episode = []
i = []
v = []
r = []
'''
MDP Parameters
'''
history_duration = 6  # Duration of state history [s]
mdp_step = 1          # Step between each state transition [s]
time_step = 0.1       # Time step [s] <-> 10 Hz data acquisition frequency
lower_bound = -3.0
upper_bound = 3.0
mdp = ContinuousMDP(history_duration, mdp_step, time_step, lower_bound, upper_bound)

'''
WIND CONDITIONS
'''
mean = 45 * TORAD
std = 0 * TORAD
wind_samples = 10
w = wind(mean=mean, std=std, samples=wind_samples)
WH = w.generateWind()

'''
Random initial conditions
'''
# hdg0_rand_vec = (0, 2, 4, 6, 8, 10, 13, 15, 17, 20)
hdg0_rand_vec = (0, 7, 13)

action_size = 2
action_size_DDPG = 1

'''
Initialize Simulation
'''
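# For reference, the history/step/time-step triple above fixes the size of the
# state fed to the networks. A minimal sketch of that bookkeeping, assuming (as
# the reshape calls elsewhere in this repo suggest) that the state stacks two
# channels (incidence and velocity) of history_duration / time_step samples
# each; the helper name state_dimensions is illustrative, not a simulator API.
def state_dimensions(history_duration, time_step, n_channels=2):
    # Samples kept per channel and total flattened state size.
    samples_per_channel = round(history_duration / time_step)  # 6 s at 10 Hz -> 60
    return samples_per_channel, n_channels * samples_per_channel

samples, flat_size = state_dimensions(history_duration=6, time_step=0.1)
print(samples, flat_size)  # 60 120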
def play(self, sess, number_run, path=''):
    print("Playing", self.name, "for", number_run, "runs")

    with sess.as_default(), sess.graph.as_default():
        hdg0_rand_vec = [0, 7, 13]

        '''
        WIND CONDITIONS
        '''
        mean = 45 * TORAD
        std = 0 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples=wind_samples)

        try:
            for i in range(number_run):
                # Reset the local network to the global one
                if self.name != 'global':
                    sess.run(self.update_local_vars)

                WH = w.generateWind()
                hdg0_rand = hdg0_rand_vec[i]
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0, WH)

                episode_reward = 0
                episode_step = 0
                v_episode = []
                i_episode = []
                done = False
                # self.lstm_state = self.network.lstm_state_init

                while not done and episode_step < 70:
                    i_episode.append(round(s[0][-1] / TORAD))
                    s = np.reshape([s[0, :], s[1, :]], [2 * self.state_size, 1])

                    # Prediction of the policy
                    feed_dict = {self.network.inputs: [s]}
                    policy, value = sess.run(
                        [self.network.policy, self.network.value],
                        feed_dict=feed_dict)
                    policy = policy[0]

                    # Choose an action according to the policy
                    action = np.random.choice([1.5, 0, -1.5], p=policy)
                    s_, r = self.env.act(action, WH)

                    if episode_step > 12:
                        if np.mean(v_episode[-4:]) > 0.8:
                            # done = True
                            print("Done!")
                        else:
                            done = False

                    episode_reward += r
                    v_episode.append(r)
                    episode_step += 1
                    s = s_

                DISPLAYER.displayVI(v_episode, i_episode, i)
                print("Episode reward :", episode_reward)

        except KeyboardInterrupt as e:
            pass
        finally:
            print("End of the demo")
def work(self, sess, coord):
    print("Running", self.name, end='\n\n')
    self.starting_time = time()
    self.nb_ep = 1
    nearlyDone = 0

    with sess.as_default(), sess.graph.as_default():
        with coord.stop_on_exception():
            while not coord.should_stop():
                self.states_buffer = []
                self.actions_buffer = []
                self.rewards_buffer = []
                self.values_buffer = []
                self.mean_values_buffer = []
                self.total_steps = 0
                episode_reward = 0
                episode_step = 0

                # Reset the local network to the global one
                sess.run(self.update_local_vars)

                mean = 45 * TORAD
                std = 0 * TORAD
                wind_samples = 10
                w = wind(mean=mean, std=std, samples=wind_samples)
                WH = w.generateWind()

                hdg0_rand = random.uniform(5, 12)
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0, WH)
                done = False
                # if self.worker_index == 1 and render and settings.DISPLAY:
                #     self.env.set_render(True)
                # self.lstm_state = self.network.lstm_state_init
                # self.initial_lstm_state = self.lstm_state

                while not coord.should_stop() and not done and \
                        episode_step < settings.MAX_EPISODE_STEP:

                    WH = np.random.uniform(mean - std, mean + std,
                                           size=wind_samples)
                    s = np.reshape([s[0, :], s[1, :]], [2 * self.state_size, 1])

                    # Prediction of the policy and the value
                    feed_dict = {self.network.inputs: [s]}
                    policy, value = sess.run(
                        [self.network.policy, self.network.value],
                        feed_dict=feed_dict)
                    policy, value = policy[0], value[0][0]

                    # Epsilon-greedy exploration on top of the stochastic policy
                    if random.random() < self.epsilon:
                        action = random.choice([1.5, 0, -1.5])
                    else:
                        # Choose an action according to the policy
                        action = np.random.choice([1.5, 0, -1.5], p=policy)

                    s_, v = self.env.act(action, WH)

                    # Reward assignment algorithm
                    if episode_step == 1:
                        r = 0
                    elif s[int(self.state_size / 2 - 2)] > (13 * TORAD) and \
                            s[int(self.state_size / 2 - 2)] < (15 * TORAD) and \
                            v > 0.63 and v < 0.67 and action < 0:
                        r = 0.5
                    else:
                        if v <= 0.69:
                            r = 0
                            nearlyDone = 0
                        elif v > 0.69 and v <= 0.75:
                            r = 0.00001
                            nearlyDone = 0
                        elif v > 0.75 and v <= 0.8:
                            r = 0.01
                            nearlyDone = 0
                        elif v > 0.80:
                            r = 0.1
                            if nearlyDone >= 3:
                                r = 1
                                done = True
                            elif nearlyDone == 2:
                                r = 0.8
                            elif nearlyDone == 1:
                                r = 0.25
                            nearlyDone = nearlyDone + 1
                        else:
                            r = 0
                            nearlyDone = False

                    # s_ = np.reshape(s_, [2*self.state_size,1])
                    # Store the experience
                    self.states_buffer.append(s)
                    self.actions_buffer.append(action)
                    self.rewards_buffer.append(r)
                    self.values_buffer.append(value)
                    self.mean_values_buffer.append(value)
                    episode_reward += r
                    s = s_
                    episode_step += 1
                    self.total_steps += 1

                    # If we have more than MAX_LEN_BUFFER experiences, we
                    # apply the gradients and update the global network,
                    # then we empty the episode buffers
                    if len(self.states_buffer) == settings.MAX_LEN_BUFFER \
                            and not done:
                        feed_dict = {
                            self.network.inputs: [
                                np.reshape([s[0, :], s[1, :]],
                                           [2 * self.state_size, 1])
                            ]
                        }
                        bootstrap_value = sess.run(self.network.value,
                                                   feed_dict=feed_dict)
                        self.train(sess, bootstrap_value)  # updates the global network
                        sess.run(self.update_local_vars)
                        # self.initial_lstm_state = self.lstm_state

                if len(self.states_buffer) != 0:
                    if done:
                        bootstrap_value = 0
                    else:
                        feed_dict = {
                            self.network.inputs: [
                                np.reshape([s[0, :], s[1, :]],
                                           [2 * self.state_size, 1])
                            ]
                        }
                        bootstrap_value = sess.run(self.network.value,
                                                   feed_dict=feed_dict)
                    self.train(sess, bootstrap_value)

                if self.epsilon > settings.EPSILON_STOP:
                    self.epsilon -= settings.EPSILON_DECAY

                self.nb_ep += 1

                if not coord.should_stop():
                    DISPLAYER.add_reward(episode_reward, self.worker_index)

                if (self.worker_index == 1 and
                        self.nb_ep % settings.DISP_EP_REWARD_FREQ == 0):
                    print('Episode %2i, Initial hdg: %2i, Reward: %7.3f, '
                          'Steps: %i, Epsilon: %7.3f' %
                          (self.nb_ep, hdg0_rand, episode_reward,
                           episode_step, self.epsilon))
                    print("Policy: ", policy)

                if (self.worker_index == 1 and
                        self.nb_ep % settings.SAVE_FREQ == 0):
                    self.save(self.total_steps)

                if time() - self.starting_time > settings.LIMIT_RUN_TIME:
                    coord.request_stop()

    self.summary_writer.close()
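# The reward assignment above is easier to read as a pure function. The sketch
# below covers only the velocity-shaping part (it omits the first-step and the
# incidence-window bonus cases); the thresholds are copied from the loop, while
# the function name and the explicit (reward, streak, done) return value are
# illustrative, not part of the codebase.
def shaped_reward(v, streak):
    """Map the velocity ratio v to (reward, new_streak, done).

    streak counts consecutive steps with v > 0.80; after three such steps
    in a row the episode terminates with the full reward of 1.
    """
    if v <= 0.69:
        return 0.0, 0, False
    if v <= 0.75:
        return 0.00001, 0, False
    if v <= 0.8:
        return 0.01, 0, False
    # v > 0.80: the reward grows with the length of the streak
    if streak >= 3:
        return 1.0, streak + 1, True
    return {0: 0.1, 1: 0.25, 2: 0.8}[streak], streak + 1, False

# With this shaping the agent only collects a substantial reward once the
# velocity ratio stays above 0.80 for several consecutive steps.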
def run(self):
    # self.load("NetworkParam_best_ThirdSemester/FinalParam")
    # get the best parameters to start the training

    self.total_steps = 0

    '''
    WIND CONDITIONS
    '''
    mean = 45 * TORAD
    std = 0.1 * TORAD
    wind_samples = 10
    w = wind(mean=mean, std=std, samples=wind_samples)
    WH = w.generateWind()

    for ep in range(1, parameters.TRAINING_STEPS + 1):

        episode_reward = 0
        episode_step = 0
        nearlyDone = 0
        done = False

        # Initialize exploration noise process
        noise_process = np.zeros(self.action_size)
        noise_scale = (parameters.NOISE_SCALE_INIT *
                       parameters.NOISE_DECAY**ep) * \
            (self.high_bound - self.low_bound)

        # Initial state
        w = wind(mean=mean, std=std, samples=wind_samples)
        WH = w.generateWind()
        hdg0_rand = random.uniform(6, 13)
        hdg0 = hdg0_rand * TORAD * np.ones(10)
        s = self.env.reset(hdg0, WH)

        while episode_step < parameters.MAX_EPISODE_STEPS:  # and not done:

            WH = np.random.uniform(mean - std, mean + std, size=wind_samples)

            # Choose an action based on the deterministic policy
            s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
            a, = self.sess.run(self.network.actions,
                               feed_dict={self.network.state_ph: s[None]})

            # Add temporally-correlated exploration noise to the action
            # (using an Ornstein-Uhlenbeck process)
            noise_process = parameters.EXPLO_THETA * \
                (parameters.EXPLO_MU - noise_process) + \
                parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
            a += noise_scale * noise_process

            # Respect the action bounds
            a = np.clip(a, self.low_bound, self.high_bound)

            s_, v = self.env.act(a, WH)

            # Reward assignment algorithm
            if episode_step == 1:
                r = 0
            # elif s[int(self.state_size/2-2)] > (13*TORAD) and \
            #         s[int(self.state_size/2-2)] < (15*TORAD) and \
            #         v > 0.63 and v < 0.67 and a < 0:
            #     r = 0.1
            else:
                if v <= 0.69:
                    r = 0
                    nearlyDone = 0
                elif v > 0.69 and v <= 0.75:
                    r = 0.00001
                    nearlyDone = 0
                elif v > 0.75 and v <= 0.8:
                    r = 0.01
                    nearlyDone = 0
                elif v > 0.80:
                    r = 0.1
                    if nearlyDone >= 3:
                        r = 1
                        done = True
                    elif nearlyDone == 2:
                        r = 0.8
                    elif nearlyDone == 1:
                        r = 0.25
                    nearlyDone = nearlyDone + 1
                else:
                    r = 0
                    nearlyDone = False

            episode_reward += r

            self.buffer.add((s, np.reshape(a, [1, 1]), r,
                             np.reshape(s_, [self.state_size, 1]),
                             0.0 if episode_step < parameters.MAX_EPISODE_STEPS - 1 else 1.0))
            # , 0.0 if done else 1.0

            # Update network weights to fit a minibatch of experience
            if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                    len(self.buffer) >= parameters.BATCH_SIZE:

                minibatch = self.buffer.sample()
                _, _, critic_loss = self.sess.run(
                    [self.network.critic_train_op,
                     self.network.actor_train_op,
                     self.network.critic_loss],
                    feed_dict={
                        self.network.state_ph: np.asarray([elem[0] for elem in minibatch]),
                        self.network.action_ph: np.asarray([elem[1] for elem in minibatch]),
                        self.network.reward_ph: np.asarray([elem[2] for elem in minibatch]),
                        self.network.next_state_ph: np.asarray([elem[3] for elem in minibatch]),
                        self.network.is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])})

                # Update target networks
                _ = self.sess.run(self.network.update_slow_targets_op)

            s = s_
            episode_step += 1
            self.total_steps += 1

        if ep % parameters.DISP_EP_REWARD_FREQ == 0:
            print('Episode %2i, initial heading: %7.3f, Reward: %7.3f, '
                  'Final noise scale: %7.3f, critic loss: %7.3f' %
                  (ep, hdg0[0] * (1 / TORAD), episode_reward,
                   noise_scale, critic_loss))
        DISPLAYER.add_reward(episode_reward)

        # Save the network weights every 500 episodes
        if ep % 500 == 0 and ep != 0:
            self.save("NetworkParam/" + str(ep) + "_epochs")

    self.save("NetworkParam/" + "FinalParam")
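# The exploration noise in run() follows the Ornstein-Uhlenbeck-style update
# used above. A minimal standalone sketch of the same step, with placeholder
# values for EXPLO_THETA, EXPLO_MU and EXPLO_SIGMA (the real ones live in
# parameters.py); the bounds mirror lower_bound/upper_bound defined earlier.
import numpy as np

EXPLO_THETA, EXPLO_MU, EXPLO_SIGMA = 0.15, 0.0, 0.2  # illustrative values
low_bound, high_bound = -3.0, 3.0

def ou_step(noise_process, action_size=1):
    # One noise update as in run(): pull the process toward EXPLO_MU and add
    # a Gaussian kick, producing temporally-correlated exploration noise.
    return EXPLO_THETA * (EXPLO_MU - noise_process) + \
        EXPLO_SIGMA * np.random.randn(action_size)

noise = np.zeros(1)
a = np.array([0.5])                 # deterministic action from the actor
for _ in range(5):
    noise = ou_step(noise)
    a_explore = np.clip(a + noise, low_bound, high_bound)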
def playCritic(self):
    self.load("NetworkParam/FinalParam")
    hdg0_rand_vec = [0, 7, 12]

    '''
    WIND CONDITIONS
    '''
    mean = 45 * TORAD
    std = 0.1 * TORAD
    wind_samples = 10
    w = wind(mean=mean, std=std, samples=wind_samples)

    try:
        for i in range(len(hdg0_rand_vec)):
            # Initial state
            WH = w.generateWind()
            hdg0_rand = hdg0_rand_vec[i]
            hdg0 = hdg0_rand * TORAD * np.ones(10)
            s = self.env.reset(hdg0, WH)

            episode_reward = 0
            episode_step = 0
            v_episode = []
            i_episode = []

            while episode_step < 30:  # not done:
                if episode_step == 0:
                    i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                else:
                    i_episode.append(s[0][-1] / TORAD)

                # Critic policy: evaluate a discrete grid of candidate actions
                # and pick the one with the highest critic value
                candidate_actions = [-1.5, -1.25, -1, -0.75, -0.5, -0.25, 0,
                                     0.25, 0.5, 0.75, 1, 1.25, 1.5]
                critic = [self.evaluate(s, a) for a in candidate_actions]
                a = candidate_actions[np.argmax(critic)]

                s_, r = self.env.act(a, WH)
                episode_reward += r
                v_episode.append(r)
                episode_step += 1
                s = s_

            DISPLAYER.displayVI(v_episode, i_episode, i + 3)
            print("Episode reward :", episode_reward, " for incidence: ", hdg0_rand)

    except KeyboardInterrupt as e:
        pass
    except Exception as e:
        print("Exception :", e)
    finally:
        print("End of the demo")