Example #1
def start_test(goal_position):
    """
    During the test phase the agents use only the action that are predicted from the actor network, well trained.
    In practice every step an action is predicted anbd a reward is done for that action.
    If the network is well trained is must be able every time to maximize the reward in 
    order to reach the goal 
    """

    debug = True
    env = Environment(
        debug, goal_position
    )  #Holds all the functions needed for interaction with the environment

    observ_dim = env.num_states
    actions_dim = env.num_actions
    #Define hyperparameter values
    gamma = 0.98  #discount factor: models the fact that future rewards are worth less than immediate rewards
    #Q-value factor: if set close to 1, future rewards are weighted almost as much as immediate ones
    tau = 0.001  #soft update rate for the target neural networks

    #goal_position = [5.0, -3.0]

    max_episode = 30
    max_steps = 1000
    reward = 0
    terminal = [False]
    save_stats = True
    #Tell TensorFlow to run on the CPU
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    K.set_session(sess)

    #Directory where the saved models can be found
    load_path = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved'  #/Model_Weights_saved'
    load_directory = os.path.join(os.getcwd(), load_path)

    if not os.path.isdir(
            load_directory):  #isdir returns True if the path is an existing directory
        os.makedirs(load_directory)
    os.chdir(load_directory)

    mean_reward = []
    std_reward = []
    ep_reward = []
    episode = []
    distance = []
    #Test each loaded model for max_episode episodes
    #Load models 299, 319, etc., one every 20 episodes up to 479
    for i in range(299, 499, 20):
        print(i)
        #Load actor and critic model
        #        actor_model = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/%d_actor_model.h5' %(i)
        #        critic_model = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/%d_critic_model.h5' %(i)
        #
        #        path_actor = os.path.join(load_directory, actor_model)
        #        path_critic = os.path.join(load_directory, critic_model)
        #        print(path_actor)
        #        try:
        #            actor = load_model(actor_model)
        #            critic = load_model(critic_model)
        #            print('actor', actor)
        #            print('Model weight succesfully')
        #        except:
        #            print('ERROR: Model weight not succesfully')

        actor = ActorNetwork(env, sess)
        #critic = CriticNetwork(env,sess)
        actor_model = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/%d_actor_model.h5' % (
            i)
        # critic_model = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/%d_critic_model.h5' %(i)
        try:
            actor.model.load_weights(actor_model)
            actor.model_target.load_weights(actor_model)
            #critic.model.load_weights(critic_model)
            #critic.model_target.load_weights(critic_model)

            print("WEIGHTS LOADED CORRECTLY")
        except Exception:
            print("ERR: WEIGHTS NOT LOADED CORRECTLY")

        episode_reward = []
        #model_num.append(i)
        episode_check = 0
        desired_checking_episode = 20

        for ep in range(max_episode):

            #Take the initial observation, as in the training phase
            state_t = env._reset(
            )  #reset the environment ---> wait for take-off -> also returns the state information relative to the current drone position, etc.
            state_t = np.asarray(
                state_t
            )  #create an array that is the state at time t: errorX, errorY, Terminal
            total_reward = [0]  #initialize the reward
            terminal = [False]  #terminal flag for the episode
            step = 0  #number of iterations inside each episode
            episode_check = episode_check + 1

            while not terminal[0]:
                if step > 200:
                    break
                print(
                    '############################################################'
                )
                step = step + 1

                action_t = np.zeros(
                    [1, actions_dim]
                )  #create a zero array with the same dimension as the number of actions

                action_t_initial = actor.model.predict(
                    state_t.reshape(1, state_t.shape[0]))

                action_t[0][0] = action_t_initial[0][0]
                action_t[0][1] = action_t_initial[0][1]

                #Step, Apply action in the environment and reach a new state
                state_t1, reward_t, terminal = env._step(action_t[0], step)
                state_t1 = np.asarray(state_t1)

                total_reward[0] = total_reward[0] + reward_t[0]
                state_t = state_t1

                #Evaluate the distance error for printing purposes
                error_x = (goal_position[0] - state_t[0])
                error_y = (goal_position[1] - state_t[1])
                distance_error = math.sqrt(error_x * error_x +
                                           error_y * error_y)
                print(
                    'episode: {}, step: {},distance_error: {}, total_reward :{}'
                    .format(ep, step, distance_error, total_reward[0]))
            episode_reward.append(total_reward)
            print('episode: {}, total_ep_reward :{}'.format(
                ep, np.mean(episode_reward)))
            if (save_stats):

                episode.append(ep)
                mean_reward.append(np.mean(episode_reward))
                std_reward.append(np.std(episode_reward))
                ep_reward.append(total_reward[0])
                distance.append(distance_error)

                if (episode_check == desired_checking_episode):

                    ep_reward_mat = np.asarray(ep_reward)
                    episode_mat = np.asarray([episode])
                    distance_mat = np.asarray(distance)
                    mean_reward_mat = np.asarray(mean_reward)
                    std_reward_mat = np.asarray(std_reward)
                    episode_mat = np.resize(episode_mat, [ep, 1])

                    episode_name = load_path + '/Test_Statistics/%d_test_episode.csv' % (
                        ep)
                    episode_reward_name = load_path + '/Test_Statistics/%d_test_reward.csv' % (
                        ep)
                    distance_name = load_path + '/Test_Statistics/%d_test_distance.csv' % (
                        ep)
                    mean_reward_name = load_path + '/Test_Statistics/%d_test_mean_reward.csv' % (
                        ep)
                    std_reward_name = load_path + '/Test_Statistics/%d_test_std_reward.csv' % (
                        ep)
                    np.savetxt(
                        episode_name, episode_mat, delimiter=","
                    )  #In MATLAB post-processing, import the episode vector on the x axis and plot reward and distance on the y axis
                    np.savetxt(episode_reward_name,
                               ep_reward_mat,
                               delimiter=",")
                    np.savetxt(mean_reward_name,
                               mean_reward_mat,
                               delimiter=",")
                    np.savetxt(std_reward_name, std_reward_mat, delimiter=",")
                    np.savetxt(distance_name, distance_mat, delimiter=",")

                    print('Statistics saved successfully in directory:',
                          load_path, '/Test_Statistics/')
                    episode_check = 0
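Both examples assume an `Environment` class (defined elsewhere in the project) exposing `num_states`, `num_actions`, `_reset()` and `_step(action, step)`. The stub below is a hypothetical sketch of that interface, only to show the expected signatures and return shapes; the attribute values and placeholder dynamics are illustrative, not the real drone environment:

class Environment(object):
    """Hypothetical stub of the drone environment interface used by both examples."""

    def __init__(self, debug, goal_position):
        self.debug = debug
        self.goal_position = goal_position
        self.num_states = 3   #illustrative: e.g. position/error components plus a terminal flag
        self.num_actions = 2  #illustrative: two continuous action components, as predicted above

    def _reset(self):
        #The real environment waits for take-off and returns the initial state
        return [0.0, 0.0, 0.0]

    def _step(self, action, step):
        #The real environment applies the action and returns (new_state, reward list, terminal list)
        state_t1 = [0.0, 0.0, 0.0]
        reward_t = [0.0]
        terminal = [step >= 200]
        return state_t1, reward_t, terminal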
Example #2
def start_training(goal_position):
    debug = True
    env = Environment(
        debug, goal_position
    )  #Holds all the functions needed for interaction with the environment

    observ_dim = env.num_states
    actions_dim = env.num_actions

    #Define buffer size and dimension
    buffer_size = 5000
    miniBatch_size = 32

    #Define hyperparameter values
    gamma = 0.98  #discount factor: models the fact that future rewards are worth less than immediate rewards
    #Q-value factor: if set close to 1, future rewards are weighted almost as much as immediate ones
    tau = 0.001  #soft update rate for the target neural networks

    #training parameters

    explore = 10000
    max_episode = 5000
    max_steps_in_ep = 10000
    reward = 0

    done = False
    epsilon = 0.9  #exploration-exploitation trade-off value
    indicator = 0

    plot_reward = False
    save_stats = True
    #Create empty arrays for the plotting variables
    ep_reward = []
    episode = []
    distance = []

    distance_step = []
    step_reward = []

    #Define the goal position only for printing purposes
    distance_error = []
    goal_position = [2.0, 3.0]
    episode_check = 0
    desired_checking_episode = 10
    #If running on RDS uncomment this part
    #Tensorflow GPU optimization
    #    config = tf.ConfigProto()
    #    config.gpu_options.allow_growth = True
    #    sess = tf.Session(config=config)
    #    from keras import backend as K
    #    K.set_session(sess)
    #
    #Tell TensorFlow to run on the CPU
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    K.set_session(sess)

    #Define the actor, critic Network and Buffer

    actor = ActorNetwork(env, sess)
    critic = CriticNetwork(env, sess)
    replay_buffer = ReplayBuffer()
    saved_path = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved'  #/Model_Weights_saved'
    save_directory = os.path.join(os.getcwd(), saved_path)

    try:
        actor.model.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5"
        )
        actor.model_target.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5"
        )
        critic.model.load_weights(
            '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5'
        )
        critic.model_target.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5"
        )

        #critic.model_target.load_weights("/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/219_critic_weights.h5")
        print("WEIGHTS LOAD CORRECTLY")
    except:
        print("ERR: WEIGHTS LOAD UNCORRECTLY")

    if not os.path.isdir(
            save_directory):  #isdir returns True if the path is an existing directory
        os.makedirs(save_directory)
    os.chdir(save_directory)

    #plot graphs settings
    if (plot_reward):
        plt.ion()  #turn the interactive mode on
        plt.title('Training Curve')
        plt.xlabel('Episodes')
        plt.ylabel('Total Reward')
        plt.grid()

        plt.ion()
        plt.title('Distance Error')
        plt.xlabel('Episodes')
        plt.ylabel('Cartesian Error')
        plt.grid()
    #Main training loop
    for ep in range(500, max_episode):
        #receive the initial observation state
        state_t = env._reset(
        )  #reset the environment ---> wait for take-off -> also returns the state information relative to the current drone position, etc.
        state_t = np.asarray(
            state_t
        )  #create an array that is the state at time t: errorX, errorY, Terminal
        total_reward = [0]  #initialize the reward
        terminal = [False]  #terminal flag for the episode
        step = 0  #number of iterations inside each episode
        episode_check = episode_check + 1
        while not terminal[0]:
            if step > 200:
                break  #exit from the step loop

            step = step + 1

            #            if debug:
            #                print('###############################')
            #print('step: {}'.format(step))
            print(
                '############################################################')
            loss = 0
            epsilon -= 1.0 / explore  #decay epsilon to shift from exploration towards exploitation

            action_t = np.zeros(
                [1, actions_dim]
            )  #create a zero array with the same dimension as the number of actions
            noise_t = np.zeros([1, actions_dim])  #noise array

            #The current action is selected according to the current policy plus exploration noise
            #The action is first predicted by the actor network without noise

            action_t_initial = actor.model.predict(
                state_t.reshape(1, state_t.shape[0])
            )  #make a prediction given the state input; shape gives the dimension of the vector
            #print('action_t_initial', action_t_initial)

            #add the noise to the predicted action
            noise_t[0][0] = OUhlenbeck_noise(epsilon, action_t_initial[0][0])
            noise_t[0][1] = OUhlenbeck_noise(epsilon, action_t_initial[0][1])
            #noise_t[0][2] = OUhlenbeck_noise(epsilon,action_t_initial[0][2])

            action_t[0][0] = action_t_initial[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_initial[0][1] + noise_t[0][1]

            #Step, Apply action in the environment and reach a new state
            state_t1, reward_t, terminal = env._step(action_t[0], step)
            #print('state_t1 : {}'.format(state_t1))

            state_t1 = np.asarray(state_t1)  #create array of the new state
            #Now the sequence state_t, action, reward, state_t1 must be added to the replay buffer experience
            replay_buffer.add_experience(state_t, action_t[0], reward_t,
                                         state_t1, terminal)

            #Sample a mini-batch of experiences (state, action, reward, new state, terminal) from the replay buffer
            mini_batch = replay_buffer.take_experience()

            states_buff = np.asarray([i[0] for i in mini_batch])
            actions_buff = np.asarray([i[1] for i in mini_batch])
            reward_buff = np.asarray([i[2] for i in mini_batch])
            state_new_buff = np.asarray([i[3] for i in mini_batch])
            terminal_buff = np.asarray([i[4] for i in mini_batch])
            #instantiate a y_target vector which must have the same dimension as the length of the mini-batch
            #y_target = np.asarray([i[1] for i in mini_batch]) #only to obtain an array of the desired dimension

            #Predict an action from the actor target network given state_new_buff from the mini-batch
            action_new_buff = actor.model_target.predict(state_new_buff)

            #Take the critic target network's prediction of the Q target for the new state and action from the mini-batch
            Q_target_predicted = critic.model_target.predict(
                [state_new_buff, action_new_buff])
            #            print('Q_target_predicted', Q_target_predicted)
            #            print('reward_buff', reward_buff)
            #Update the Q-value target by evaluating the Bellman equation
            y_target = []
            for j in range(len(mini_batch)):

                if terminal_buff[j]:
                    #y_target[j] =  reward_buff[j]
                    y_target.append(reward_buff[j])
                else:

                    y_target.append(
                        reward_buff[j] + gamma * Q_target_predicted[j]
                    )  #appends an array each time, effectively building a list

            #Resize everything to obtain an array with 1 column and as many rows as the batch size
            y_target = np.resize(y_target, [len(mini_batch), 1])

            #Evaluate the loss using model.train_on_batch and update the weights of the critic,
            #having as target the y_target evaluated from the Bellman equation
            loss = loss + critic.model.train_on_batch(
                [states_buff, actions_buff],
                y_target)  # L = 1/N * sum((y_target - Q(si,ai|theta^Q))^2)

            #The actor policy is updated using the sampled policy gradient
            ############ see https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html for the full explanation
            #An action is predicted from the states in the buffer; the predicted action is used to evaluate the gradient of the critic
            action_for_grad = actor.model.predict(states_buff)
            #The actor network is trained by computing the gradient of the critic network with respect to the actions.
            #This is because the actor must follow the direction of maximum increase of the critic, which plays the role of the Q network.
            #As in Q-learning, where in the Q table you follow the action that increases the Q value, the same choice is made here;
            #the only difference is that instead of a tabulated value we follow the gradient of the critic network
            critic_gradient = critic.gradients(states_buff, action_for_grad)
            #The actor network is trained with the states from which the critic gradient was computed as input and the critic_gradient itself as target.
            #The goal of the actor network is to output actions that follow the direction of the gradient and maximize it at every step
            actor.actor_train(states_buff, critic_gradient)
            #The next two calls apply the soft update to the target networks:
            #theta^Q' = tau*theta^Q + (1 - tau)*theta^Q'
            actor.target_net_train()
            critic.target_net_train()

            #Evaluate the distance error for printing purposes
            error_x = (goal_position[0] - state_t[0])
            error_y = (goal_position[1] - state_t[1])
            distance_error = math.sqrt(error_x * error_x + error_y * error_y)

            #Update the total reward
            #print('reward_t', reward_t)

            if not reward_t[0]:  #if the environment returned no reward, penalize proportionally to the distance error
                reward_t[0] = -100 * distance_error

            total_reward[0] = total_reward[0] + reward_t[0]

            #The new state becomes the actual state
            state_t = state_t1

            #Save distance and reward for each step, only for plotting purposes
            distance_step.append(distance_error)
            step_reward.append(reward_t[0])
            if terminal[0] or step == 200:
                distance_step_mat = np.asarray(distance_step)

                step_reward_mat = np.asarray(step_reward)

                distance_step_name = 'Statistics/Step_Statistics/%d_distance_step.csv' % (
                    ep)
                step_reward_name = 'Statistics/Step_Statistics/%d_step_reward.csv' % (
                    ep)

                np.savetxt(
                    distance_step_name, distance_step_mat, delimiter=","
                )  #In MATLAB post-processing, import the episode vector on the x axis and plot reward and distance on the y axis
                np.savetxt(step_reward_name, step_reward_mat, delimiter=",")
                distance_step_mat = []
                step_reward_mat = []
                distance_step = []
                step_reward = []

            #Print the episode progress
            print(
                'episode: {}, steps: {}, tot_rewards: {}, terminal: {}'.format(
                    ep, step, total_reward, terminal))

            print('distance_error:{}, pos_x: {}, pos_y: {}'.format(
                distance_error, state_t[0], state_t[1]))

            #if ((step+1)%10 == 0):
        #Save the model and weights every desired_checking_episode episodes as a checkpoint
        if (episode_check == desired_checking_episode):
            #Save the models
            action_model_name = 'Actor_weights/%d_actor_model.h5' % (ep)
            critic_model_name = 'Critic_weights/%d_critic_model.h5' % (ep)
            save_path = os.path.join(save_directory, action_model_name)
            actor.model.save(action_model_name)  #overwrites an existing file by default
            critic.model.save(critic_model_name)
            print('Model Saved in path: %s' % save_directory)

            #Save Weights
            model_ext = ".h5"
            model_ext2 = ".json"
            action_save_weights_name = 'Actor_weights/%d_actor_weights' % (ep)
            actor.model.save_weights(action_save_weights_name + model_ext,
                                     overwrite=True)  #Save Weights
            with open(action_save_weights_name + model_ext2, "w") as outfile:
                json.dump(actor.model.to_json(),
                          outfile)  #save the model architecture, not the weights

            critic_save_weights_name = 'Critic_weights/%d_critic_weights' % (
                ep)
            critic.model.save_weights(critic_save_weights_name + model_ext,
                                      overwrite=True)
            with open(critic_save_weights_name + model_ext2, "w") as outfile:
                json.dump(critic.model.to_json(), outfile)

            print('Weights Saved in path: %s' % save_directory)

        #######################
        #Save Statistics
        if (save_stats):

            episode.append(ep)
            ep_reward.append(total_reward[0])
            distance.append(distance_error)

            if (episode_check == desired_checking_episode):

                ep_reward_mat = np.asarray(ep_reward)
                episode_mat = np.asarray([episode])
                distance_mat = np.asarray(distance)

                episode_mat = np.resize(episode_mat, [ep, 1])

                episode_name = 'Statistics/%d_episode.csv' % (ep)
                episode_reward_name = 'Statistics/%d_reward.csv' % (ep)
                distance_name = 'Statistics/%d_distance.csv' % (ep)
                np.savetxt(
                    episode_name, episode_mat, delimiter=","
                )  #In MATLAB post-processing, import the episode vector on the x axis and plot reward and distance on the y axis
                np.savetxt(episode_reward_name, ep_reward_mat, delimiter=",")
                np.savetxt(distance_name, distance_mat, delimiter=",")
                episode_check = 0
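Example #2 also calls `OUhlenbeck_noise(epsilon, action)`, which is not part of this listing. Below is a minimal sketch of an Ornstein-Uhlenbeck-style noise term with the same call signature; the mu/theta/sigma parameters and the epsilon gating are assumptions made here for illustration, not the original implementation:

import numpy as np

def OUhlenbeck_noise(epsilon, action, mu=0.0, theta=0.6, sigma=0.3):
    """Hypothetical Ornstein-Uhlenbeck-style exploration noise for a single action component."""
    if epsilon <= 0:
        return 0.0  #stop adding noise once the exploration budget is exhausted
    #mean-reverting pull towards mu plus a Gaussian perturbation, scaled by the remaining exploration
    return epsilon * (theta * (mu - action) + sigma * np.random.randn())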