예제 #1
0
 def __init__(
         self,
         hist_duration,
         mdp_step,
         time_step,
         action_size,
         batch_size,
         mean,
         std,
         hdg0,
         src_file,
         sim_time):
     """Build the MDP, the pre-trained policy learner and the wind model.

     Args:
         hist_duration: forwarded to ``MDP`` as its first argument.
         mdp_step: forwarded to ``MDP``; also the numerator of the number
             of wind samples (``int(mdp_step / time_step)``).
         time_step: forwarded to ``MDP``; denominator of the sample count.
         action_size: stored and forwarded to ``PolicyLearner``.
         batch_size: forwarded to ``PolicyLearner``.
         mean: first argument of ``wind``.
         std: second argument of ``wind``.
         hdg0: stored unchanged as ``self.hdg0``.
         src_file: location the learner's parameters are loaded from; also
             stored as ``self.src``.
         sim_time: stored unchanged as ``self.sim_time``.
     """
     # Plain configuration values, kept as given.
     self.action_size = action_size
     self.hdg0 = hdg0
     self.src = src_file
     self.sim_time = sim_time

     # The learner's input size comes from the MDP, so build the MDP first.
     self.mdp = MDP(hist_duration, mdp_step, time_step)
     self.agent = PolicyLearner(self.mdp.size, action_size, batch_size)
     self.agent.load(src_file)

     # Third argument of wind() is presumably the sample count per MDP
     # step (other call sites pass it as `samples=`) -- TODO confirm.
     samples_per_step = int(mdp_step / time_step)
     self.wh = wind(mean, std, samples_per_step)
예제 #2
0
    def playActor(self):
        """Replay the saved actor on three preset initial headings.

        Restores the parameters stored under ``NetworkParam/FinalParam``,
        then, for each preset heading, resets the environment and applies
        the clipped network action for 40 steps (no exploration noise is
        added here). The per-step rewards and incidences of every run are
        forwarded to ``DISPLAYER.displayVI`` and the summed reward is
        printed.

        A ``KeyboardInterrupt`` ends the demo silently; any other exception
        is printed instead of being propagated.
        """
        self.load("NetworkParam/FinalParam")

        preset_headings = [0, 7, 12]
        steps_per_run = 40

        # Wind conditions
        wind_gen = wind(mean=45 * TORAD, std=0.1 * TORAD, samples=10)

        try:
            for run_idx, hdg0_rand in enumerate(preset_headings):
                # Initial state
                WH = wind_gen.generateWind()
                s = self.env.reset(hdg0_rand * TORAD * np.ones(10), WH)

                total_reward = 0
                rewards = []
                incidences = []
                for step in range(steps_per_run):
                    # The first point is computed from the preset heading
                    # and the first wind sample; later points are read from
                    # the last entry of the first state row.
                    if step == 0:
                        incidences.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        incidences.append(s[0][-1] / TORAD)

                    net_input = np.reshape([s[0, :], s[1, :]],
                                           [self.state_size, 1])
                    a, = self.sess.run(
                        self.network.actions,
                        feed_dict={self.network.state_ph: net_input[None]})
                    a = np.clip(a, self.low_bound, self.high_bound)

                    s, r = self.env.act(a, WH)
                    total_reward += r
                    rewards.append(r)

                DISPLAYER.displayVI(rewards, incidences, run_idx)
                print("Episode reward :", total_reward," for incidence: ",hdg0_rand)

        except KeyboardInterrupt:
            pass

        except Exception as err:
            print("Exception :", err)

        finally:
            print("End of the demo")
예제 #3
0
import numpy as np
from math import *

from policyLearning import PolicyLearner

from Simulator import TORAD
from mdp import MDP
from environment import wind
# --- MDP parameters ---
mdp = MDP(
    duration_history=6,
    duration_simulation=1,
    delta_t=0.1,
)

# --- Wind conditions ---
w = wind(mean=45 * TORAD, std=0 * TORAD, samples=10)
WH = w.generateWind()

# Start from a zero heading history of 10 samples.
hdg0 = np.zeros(10)
mdp.initializeMDP(hdg0, WH)

agent = PolicyLearner(mdp.size, action_size=2, batch_size=32)
# Optional warm start (disabled): agent.load("../Networks/epsilon_pi")

EPISODES = 1
count_luff = 0
count_bear_off = 0

# Accumulators; the code that fills them is outside this excerpt.
loss_of_episode, i, v, r = [], [], [], []
예제 #4
0
MDP Parameters
'''
# Time discretisation of the MDP (all durations in seconds).
history_duration = 6    # length of the state history [s]
mdp_step = 1            # interval between two state transitions [s]
time_step = 0.1         # simulation step [s], i.e. 10 Hz data acquisition
# Bounds handed to ContinuousMDP (presumably the action range -- TODO confirm).
lower_bound, upper_bound = -3.0, 3.0
mdp = ContinuousMDP(history_duration, mdp_step, time_step,
                    lower_bound, upper_bound)

# --- Wind conditions ---
mean = 45 * TORAD
std = 0 * TORAD
wind_samples = 10
w = wind(mean=mean, std=std, samples=wind_samples)
WH = w.generateWind()

# --- Initial conditions ---
# Alternative, denser sweep: (0, 2, 4, 6, 8, 10, 13, 15, 17, 20)
hdg0_rand_vec = (0, 7, 13)
action_size = 2
action_size_DDPG = 1

# --- Initialize simulation ---
예제 #5
0
    def play(self, sess, number_run, path=''):
        """Roll out the current policy for ``number_run`` demo episodes.

        Each run resets the environment on one of the preset initial
        headings (0, 7, 13, each multiplied by ``TORAD``), samples actions
        from the network's policy output for 70 steps and hands the
        per-step rewards and incidences to ``DISPLAYER.displayVI``.

        Args:
            sess: session used to evaluate the network; it and its graph
                are made the defaults for the duration of the call.
            number_run: number of episodes to play. The preset headings are
                reused cyclically when this exceeds their count (indexing
                them directly used to raise ``IndexError`` from the fourth
                run on).
            path: unused; kept so existing callers keep working.
        """
        print("Playing", self.name, "for", number_run, "runs")

        with sess.as_default(), sess.graph.as_default():
            hdg0_rand_vec = [0, 7, 13]
            actions = [1.5, 0, -1.5]
            max_steps = 70

            # Wind conditions
            w = wind(mean=45 * TORAD, std=0 * TORAD, samples=10)

            try:
                for i in range(number_run):

                    # Reset the local network to the global
                    if self.name != 'global':
                        sess.run(self.update_local_vars)

                    WH = w.generateWind()
                    # Cycle through the presets so any number_run is valid.
                    hdg0_rand = hdg0_rand_vec[i % len(hdg0_rand_vec)]
                    hdg0 = hdg0_rand * TORAD * np.ones(10)
                    s = self.env.reset(hdg0, WH)
                    episode_reward = 0
                    v_episode = []
                    i_episode = []

                    for episode_step in range(max_steps):
                        i_episode.append(round(s[0][-1] / TORAD))
                        s = np.reshape([s[0, :], s[1, :]],
                                       [2 * self.state_size, 1])

                        # Only the policy head is needed for the demo.
                        policy = sess.run(
                            self.network.policy,
                            feed_dict={self.network.inputs: [s]})[0]

                        # Choose an action according to the policy
                        action = np.random.choice(actions, p=policy)
                        s, r = self.env.act(action, WH)

                        # Early termination is intentionally disabled: a
                        # high mean over the previous four rewards is only
                        # reported, the episode still runs to max_steps.
                        if (episode_step > 12
                                and np.mean(v_episode[-4:]) > 0.8):
                            print("Done!")

                        episode_reward += r
                        v_episode.append(r)

                    DISPLAYER.displayVI(v_episode, i_episode, i)

                    print("Episode reward :", episode_reward)

            except KeyboardInterrupt:
                # Allow the user to abort the demo without a traceback.
                pass

            finally:
                print("End of the demo")
예제 #6
0
    def work(self, sess, coord):
        """Run this worker's episode/training loop until the coordinator stops.

        Every episode syncs the local network with the global one, resets
        the environment on a random initial heading drawn from
        ``random.uniform(5, 12)`` (scaled by ``TORAD``) and acts
        epsilon-greedily on the network's policy output. The second value
        returned by ``self.env.act`` (``v``) is mapped to a shaped reward,
        experiences are buffered, and ``self.train`` is called whenever the
        buffer holds ``settings.MAX_LEN_BUFFER`` entries and once more at
        the end of the episode.

        The run stops when ``coord.should_stop()`` becomes true or after
        ``settings.LIMIT_RUN_TIME`` seconds, in which case this worker
        requests the stop itself. Exceptions raised inside the loop are
        reported to the coordinator through ``coord.stop_on_exception()``.

        Args:
            sess: session used for all network evaluations and updates.
            coord: coordinator object providing ``should_stop``,
                ``request_stop`` and ``stop_on_exception``.
        """
        print("Running", self.name, end='\n\n')
        self.starting_time = time()
        self.nb_ep = 1

        def bootstrap_from(state):
            # Value estimate of `state`, passed to self.train as the
            # bootstrap value for the buffered experiences.
            feed = {
                self.network.inputs: [
                    np.reshape([state[0, :], state[1, :]],
                               [2 * self.state_size, 1])
                ]
            }
            return sess.run(self.network.value, feed_dict=feed)

        with sess.as_default(), sess.graph.as_default():

            with coord.stop_on_exception():
                # Wind parameters, identical for every episode.
                mean = 45 * TORAD
                std = 0 * TORAD
                wind_samples = 10

                actions = [1.5, 0, -1.5]
                # Index of the state sample tested by the 0.5-reward rule.
                probe_idx = int(self.state_size / 2 - 2)

                while not coord.should_stop():

                    self.states_buffer = []
                    self.actions_buffer = []
                    self.rewards_buffer = []
                    self.values_buffer = []
                    self.mean_values_buffer = []

                    # NOTE(review): this counter restarts every episode, so
                    # the value passed to self.save() below is the step
                    # count of the last episode only -- confirm intent.
                    self.total_steps = 0
                    episode_reward = 0
                    episode_step = 0
                    # Streak of consecutive steps with v > 0.8. It must
                    # start at 0 for each episode; otherwise a streak left
                    # over from the previous episode would terminate this
                    # one on its first high-v step.
                    nearlyDone = 0
                    done = False
                    # Defined up front so the progress print below cannot
                    # hit an unbound name when no step was executed.
                    policy = None

                    # Reset the local network to the global
                    sess.run(self.update_local_vars)

                    w = wind(mean=mean, std=std, samples=wind_samples)
                    WH = w.generateWind()
                    hdg0_rand = random.uniform(5, 12)
                    hdg0 = hdg0_rand * TORAD * np.ones(10)
                    s = self.env.reset(hdg0, WH)

                    while not coord.should_stop() and not done and \
                            episode_step < settings.MAX_EPISODE_STEP:

                        # Fresh wind samples for every step.
                        WH = np.random.uniform(mean - std,
                                               mean + std,
                                               size=wind_samples)
                        s = np.reshape([s[0, :], s[1, :]],
                                       [2 * self.state_size, 1])

                        # Prediction of the policy and the value
                        policy, value = sess.run(
                            [self.network.policy, self.network.value],
                            feed_dict={self.network.inputs: [s]})
                        policy, value = policy[0], value[0][0]

                        if random.random() < self.epsilon:
                            # Exploration: uniform random action.
                            action = random.choice(actions)
                        else:
                            # Choose an action according to the policy
                            action = np.random.choice(actions, p=policy)

                        s_, v = self.env.act(action, WH)

                        # Reward shaping.
                        if episode_step == 1:
                            r = 0
                        elif (s[probe_idx] > 13 * TORAD
                              and s[probe_idx] < 15 * TORAD
                              and v > 0.63 and v < 0.67 and action < 0):
                            r = 0.5
                        elif v <= 0.69:
                            r = 0
                            nearlyDone = 0
                        elif v <= 0.75:
                            r = 0.00001
                            nearlyDone = 0
                        elif v <= 0.8:
                            r = 0.01
                            nearlyDone = 0
                        elif v > 0.80:
                            # Reward grows with the length of the streak;
                            # the fourth consecutive hit ends the episode.
                            r = 0.1
                            if nearlyDone >= 3:
                                r = 1
                                done = True
                            elif nearlyDone == 2:
                                r = 0.8
                            elif nearlyDone == 1:
                                r = 0.25
                            nearlyDone = nearlyDone + 1
                        else:
                            # Only reachable when v compares false to
                            # everything (e.g. NaN).
                            r = 0
                            nearlyDone = 0

                        # Store the experience
                        self.states_buffer.append(s)
                        self.actions_buffer.append(action)
                        self.rewards_buffer.append(r)
                        self.values_buffer.append(value)
                        self.mean_values_buffer.append(value)
                        episode_reward += r
                        s = s_

                        episode_step += 1
                        self.total_steps += 1

                        # Once MAX_LEN_BUFFER experiences are collected,
                        # train (which updates the global network) and
                        # re-sync the local copy. Emptying the buffers is
                        # presumably done inside self.train -- TODO confirm.
                        if len(self.states_buffer) == settings.MAX_LEN_BUFFER \
                                and not done:
                            self.train(sess, bootstrap_from(s))
                            sess.run(self.update_local_vars)

                    # Train on whatever is left at the end of the episode.
                    if self.states_buffer:
                        bootstrap_value = 0 if done else bootstrap_from(s)
                        self.train(sess, bootstrap_value)

                    if self.epsilon > settings.EPSILON_STOP:
                        self.epsilon -= settings.EPSILON_DECAY

                    self.nb_ep += 1

                    if not coord.should_stop():
                        DISPLAYER.add_reward(episode_reward, self.worker_index)

                    if (self.worker_index == 1 and
                            self.nb_ep % settings.DISP_EP_REWARD_FREQ == 0):
                        print(
                            'Episode %2i, Initial hdg: %2i, Reward: %7.3f, Steps: %i, '
                            'Epsilon: %7.3f' %
                            (self.nb_ep, hdg0_rand, episode_reward,
                             episode_step, self.epsilon))
                        print("Policy: ", policy)
                    if (self.worker_index == 1
                            and self.nb_ep % settings.SAVE_FREQ == 0):
                        self.save(self.total_steps)

                    if time() - self.starting_time > settings.LIMIT_RUN_TIME:
                        coord.request_stop()

            self.summary_writer.close()
예제 #7
0
    def run(self):
        """Train the networks for ``parameters.TRAINING_STEPS`` episodes.

        Every episode resets the environment on a random initial heading
        drawn from ``random.uniform(6, 13)`` (scaled by ``TORAD``) and runs
        for exactly ``parameters.MAX_EPISODE_STEPS`` steps. At each step the
        network action is perturbed with decaying exploration noise and
        clipped to ``[self.low_bound, self.high_bound]``; the second value
        returned by ``self.env.act`` (``v``) is mapped to a shaped reward and
        the transition is added to ``self.buffer``. Every
        ``parameters.TRAINING_FREQ`` steps, once the buffer holds at least
        ``parameters.BATCH_SIZE`` entries, one minibatch update of critic
        and actor is run followed by the slow target update.

        Parameters are saved every 500 episodes and once more at the end
        under ``NetworkParam/``.
        """
        # Optional warm start (disabled):
        # self.load("NetworkParam_best_ThirdSemester/FinalParam")
        self.total_steps = 0

        # Last critic loss. It stays NaN until the first training step so
        # the progress print below never reads an unassigned variable
        # (training only starts once the buffer holds BATCH_SIZE samples).
        critic_loss = float('nan')

        # Wind conditions
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        # NOTE(review): this first sample is overwritten at the start of
        # every episode before being used; kept so the sequence of calls
        # into the wind generator is unchanged.
        w = wind(mean=mean, std=std, samples=wind_samples)
        WH = w.generateWind()

        for ep in range(1, parameters.TRAINING_STEPS + 1):

            episode_reward = 0
            episode_step = 0
            # Streak of consecutive steps with v > 0.8.
            nearlyDone = 0

            # Initialize exploration noise process
            noise_process = np.zeros(self.action_size)
            noise_scale = (parameters.NOISE_SCALE_INIT *
                           parameters.NOISE_DECAY**ep) * \
                (self.high_bound - self.low_bound)

            # Initial state
            w = wind(mean=mean, std=std, samples=wind_samples)
            WH = w.generateWind()
            hdg0_rand = random.uniform(6, 13)
            hdg0 = hdg0_rand * TORAD * np.ones(10)
            s = self.env.reset(hdg0, WH)

            # Episodes are never cut short: the streak rule below only
            # affects the reward, not the episode length.
            while episode_step < parameters.MAX_EPISODE_STEPS:

                # Fresh wind samples for every step.
                WH = np.random.uniform(mean - std, mean + std,
                                       size=wind_samples)

                # choose action based on deterministic policy
                s = np.reshape([s[0, :], s[1, :]], [self.state_size, 1])
                a, = self.sess.run(self.network.actions,
                                   feed_dict={self.network.state_ph: s[None]})

                # add temporally-correlated exploration noise to action
                # (intended as an Ornstein-Uhlenbeck process)
                noise_process = parameters.EXPLO_THETA * \
                    (parameters.EXPLO_MU - noise_process) + \
                    parameters.EXPLO_SIGMA * np.random.randn(self.action_size)
                a += noise_scale * noise_process
                # to respect the bounds:
                a = np.clip(a, self.low_bound, self.high_bound)

                s_, v = self.env.act(a, WH)

                # Reward shaping.
                if episode_step == 1:
                    r = 0
                elif v <= 0.69:
                    r = 0
                    nearlyDone = 0
                elif v <= 0.75:
                    r = 0.00001
                    nearlyDone = 0
                elif v <= 0.8:
                    r = 0.01
                    nearlyDone = 0
                elif v > 0.80:
                    # Reward grows with the length of the streak.
                    r = 0.1
                    if nearlyDone >= 3:
                        r = 1
                    elif nearlyDone == 2:
                        r = 0.8
                    elif nearlyDone == 1:
                        r = 0.25
                    nearlyDone = nearlyDone + 1
                else:
                    # Only reachable when v compares false to everything
                    # (e.g. NaN).
                    r = 0
                    nearlyDone = 0

                episode_reward += r

                # NOTE(review): the flag is fed to `is_not_terminal_ph`, yet
                # it is 0.0 on every step except the last one, which looks
                # inverted with respect to the placeholder's name. Left
                # unchanged because flipping it alters training -- confirm.
                terminal_flag = (0.0 if episode_step <
                                 parameters.MAX_EPISODE_STEPS - 1 else 1.0)
                self.buffer.add((s,
                                 np.reshape(a, [1, 1]),
                                 r,
                                 np.reshape(s_, [self.state_size, 1]),
                                 terminal_flag))

                # update network weights to fit a minibatch of experience
                if self.total_steps % parameters.TRAINING_FREQ == 0 and \
                        len(self.buffer) >= parameters.BATCH_SIZE:

                    minibatch = self.buffer.sample()

                    feed_dict = {
                        self.network.state_ph:
                            np.asarray([elem[0] for elem in minibatch]),
                        self.network.action_ph:
                            np.asarray([elem[1] for elem in minibatch]),
                        self.network.reward_ph:
                            np.asarray([elem[2] for elem in minibatch]),
                        self.network.next_state_ph:
                            np.asarray([elem[3] for elem in minibatch]),
                        self.network.is_not_terminal_ph:
                            np.asarray([elem[4] for elem in minibatch]),
                    }
                    _, _, critic_loss = self.sess.run(
                        [self.network.critic_train_op,
                         self.network.actor_train_op,
                         self.network.critic_loss],
                        feed_dict=feed_dict)

                    # update target networks
                    self.sess.run(self.network.update_slow_targets_op)

                s = s_
                episode_step += 1
                self.total_steps += 1

            if ep % parameters.DISP_EP_REWARD_FREQ == 0:
                print('Episode %2i, initial heading: %7.3f, Reward: %7.3f, Final noise scale: %7.3f, critic loss: %7.3f' %
                      (ep, hdg0[0] * (1 / TORAD), episode_reward,
                       noise_scale, critic_loss))
            DISPLAYER.add_reward(episode_reward)

            # Save the network parameters every 500 episodes.
            if ep % 500 == 0:
                self.save("NetworkParam/" + str(ep) + "_epochs")
        self.save("NetworkParam/" + "FinalParam")
예제 #8
0
    def playCritic(self):
        """Replay three preset headings, acting greedily on ``self.evaluate``.

        Restores the parameters stored under ``NetworkParam/FinalParam``.
        For each preset heading the environment is reset and, for 30 steps,
        every candidate action between -1.5 and 1.5 (step 0.25) is scored
        with ``self.evaluate(s, action)``; the best-scoring candidate is
        applied. ``self.evaluate`` is presumably the critic's value for the
        state/action pair -- TODO confirm. Per-step rewards and incidences
        are forwarded to ``DISPLAYER.displayVI`` (figure index offset by 3)
        and the summed reward is printed.

        A ``KeyboardInterrupt`` ends the demo silently; any other exception
        is printed instead of being propagated.
        """
        self.load("NetworkParam/FinalParam")

        hdg0_rand_vec = [0, 7, 12]
        # Candidate actions, in the order they are scored. A single table
        # replaces the former 13-branch index-to-action `if` chain; the
        # int/float types of the entries are kept as they were.
        candidate_actions = (-1.5, -1.25, -1, -0.75, -0.5, -0.25, 0,
                             0.25, 0.5, 0.75, 1, 1.25, 1.5)
        max_steps = 30

        # Wind conditions
        mean = 45 * TORAD
        std = 0.1 * TORAD
        wind_samples = 10
        w = wind(mean=mean, std=std, samples=wind_samples)

        try:
            for i, hdg0_rand in enumerate(hdg0_rand_vec):
                # Initial state
                WH = w.generateWind()
                hdg0 = hdg0_rand * TORAD * np.ones(10)
                s = self.env.reset(hdg0, WH)

                episode_reward = 0
                v_episode = []
                i_episode = []
                for episode_step in range(max_steps):
                    # The first point is computed from the preset heading
                    # and the first wind sample; later points are read from
                    # the last entry of the first state row.
                    if episode_step == 0:
                        i_episode.append(hdg0_rand + WH[0] / TORAD - 40)
                    else:
                        i_episode.append(s[0][-1] / TORAD)

                    # Critic policy: pick the best-scoring candidate.
                    critic = [self.evaluate(s, candidate)
                              for candidate in candidate_actions]
                    a = candidate_actions[int(np.argmax(critic))]

                    s, r = self.env.act(a, WH)
                    episode_reward += r
                    v_episode.append(r)

                DISPLAYER.displayVI(v_episode, i_episode, i + 3)
                print("Episode reward :", episode_reward," for incidence: ",hdg0_rand)

        except KeyboardInterrupt:
            # Allow the user to abort the demo without a traceback.
            pass

        except Exception as e:
            print("Exception :", e)

        finally:
            print("End of the demo")