Example #1
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import animation

# InvertedPendulum and SIM_FPS are assumed to be defined elsewhere in this module.
def make_animation(state_log, input_log):
  fig = plt.figure()
  ax = plt.axes()
  ax.set_aspect('equal')
  plt.axis(xmin = -1.8, xmax = 1.8, ymin = -1.2, ymax = 1.2)
  
  ip = InvertedPendulum()
  
  frames = []
  
  for (state, input) in zip(state_log, input_log):
    ip.state = state
    ip.force = input
    
    frame = ip.draw(ax)
    x, xdot, a, adot = np.hsplit(ip.state, 4)
    frame.append(plt.text(0.5, -0.5,'x=%.5f' % x))
    frame.append(plt.text(0.5, -0.6,'x\'=%.5f' % xdot))
    frame.append(plt.text(0.5, -0.7,'a=%.5f' % a))
    frame.append(plt.text(0.5, -0.8,'a\'=%.5f' % adot))
    frame.append(plt.text(0.5, -0.9,'u=%.5f' % ip.force))
    frame.append(plt.text(0.5, -1.0,'E=%.5f' % ip.total_energy()))
    frames.append(frame)
  
  anim = animation.ArtistAnimation(fig, frames, interval = 1000 / SIM_FPS)
  
  #anim.save("output.mp4")
  #anim.save("output.gif", writer='imagemagick')
  plt.show()
Example #2
import numpy as np

# InvertedPendulum and SIM_FPS are assumed to be defined elsewhere in this module.
def run_simulation(initial_state, controller, duration):
  sim_step = 1 / SIM_FPS
  sim_iteration = int(duration * SIM_FPS)
  
  state_log = []
  input_log = []
  
  ip = InvertedPendulum()
  ip.state = initial_state
  
  for i in range(sim_iteration):
    ip.force = controller.process(ip.state)
    
    state_log.append(np.copy(ip.state))
    input_log.append(np.copy(ip.force))
    
    ip.step_rk4(sim_step)
  
  return (state_log, input_log)
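Taken together, the two functions above form a complete simulate-then-animate workflow. The snippet below is a minimal usage sketch under the same assumptions as the examples (an InvertedPendulum class with state, force, step_rk4, draw and total_energy members, and a SIM_FPS constant in scope); PDController is a hypothetical placeholder for any object exposing the process(state) interface that run_simulation expects, and its gains are illustrative, not tuned values.
import numpy as np

class PDController:
  # Hypothetical controller: run_simulation only requires a process(state) method
  # that returns the force to apply to the cart.
  def __init__(self, kp=20.0, kd=2.0):
    self.kp = kp
    self.kd = kd

  def process(self, state):
    x, xdot, a, adot = state
    # Illustrative PD law on the pole angle; the sign convention is an assumption.
    return -(self.kp * a + self.kd * adot)

initial_state = np.array([0.0, 0.0, 0.1, 0.0])  # cart at rest, pole tilted by 0.1 rad
state_log, input_log = run_simulation(initial_state, PDController(), duration=5.0)
make_animation(state_log, input_log)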
Example #3
import numpy as np
from inverted_pendulum import InvertedPendulum

# Helper functions (print_policy, return_decayed_value, return_epsilon_greedy_action,
# get_return, update_policy, plot_curve) are assumed to be defined elsewhere in this module.
def main():

    env = InvertedPendulum(pole_mass=2.0, cart_mass=8.0, pole_lenght=0.5, delta_t=0.1)

    # Define the state arrays for velocity and position
    tot_action = 3  # Three possible actions
    tot_bins = 12  # the value used to discretize the space
    velocity_state_array = np.linspace(-np.pi, np.pi, num=tot_bins-1, endpoint=False)
    position_state_array = np.linspace(-np.pi/2.0, np.pi/2.0, num=tot_bins-1, endpoint=False)

    #Random policy
    policy_matrix = np.random.randint(low=0, high=tot_action, size=(tot_bins,tot_bins))
    print("Policy Matrix:")
    print_policy(policy_matrix)

    state_action_matrix = np.zeros((tot_action, tot_bins*tot_bins))
    #init with 1.0e-10 to avoid division by zero
    running_mean_matrix = np.full((tot_action, tot_bins*tot_bins), 1.0e-10) 
    gamma = 0.999
    tot_episode = 500000 # 500k
    epsilon_start = 0.99  # those are the values for epsilon decay
    epsilon_stop = 0.1
    epsilon_decay_step = 10000
    print_episode = 500  # print statistics every print_episode episodes
    movie_episode = 20000  # save the plots and the gif every movie_episode episodes
    reward_list = list()
    step_list = list()

    for episode in range(tot_episode):
        epsilon = return_decayed_value(epsilon_start, epsilon_stop, episode, decay_step=epsilon_decay_step)
        #Starting a new episode
        episode_list = list()
        #Reset and return the first observation and reward
        observation = env.reset(exploring_starts=True)
        observation = (np.digitize(observation[1], velocity_state_array), 
                       np.digitize(observation[0], position_state_array))
        #action = np.random.choice(4, 1)
        #action = policy_matrix[observation[0], observation[1]]
        #episode_list.append((observation, action, reward))
        is_starting = True
        cumulated_reward = 0
        for step in range(100):
            #Take the action from the action matrix
            action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=epsilon)
            #If the episode just started then it is
            #necessary to choose a random action (exploring starts)
            if(is_starting): 
                action = np.random.randint(0, tot_action)
                is_starting = False   
            #if(episode % print_episode == 0):
            #    print("Step: " + str(step) + "; Action: " + str(action) + "; Angle: " + str(observation[0]) + "; Velocity: " + str(observation[1]))   
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_observation = (np.digitize(new_observation[1], velocity_state_array), 
                               np.digitize(new_observation[0], position_state_array))  
            #Append the visit in the episode list
            episode_list.append((observation, action, reward))
            observation = new_observation
            cumulated_reward += reward
            if done: break
        #The episode is finished, now estimating the utilities
        counter = 0
        #Checkup to identify if it is the first visit to a state
        checkup_matrix = np.zeros((tot_action, tot_bins*tot_bins))
        #This cycle is the implementation of First-Visit MC.
        #For each state stored in the episode list check if it
        #is the first visit and then estimate the return.
        for visit in episode_list:
            observation = visit[0]
            action = visit[1]
            col = observation[1] + (observation[0]*tot_bins)
            row = action
            if(checkup_matrix[row, col] == 0):
                return_value = get_return(episode_list[counter:], gamma)
                running_mean_matrix[row, col] += 1
                state_action_matrix[row, col] += return_value
                checkup_matrix[row, col] = 1
            counter += 1
        #Policy Update
        policy_matrix = update_policy(episode_list, 
                                      policy_matrix, 
                                      state_action_matrix/running_mean_matrix,
                                      tot_bins)
        # Store the data for statistics
        reward_list.append(cumulated_reward)
        step_list.append(step)
        # Printing utilities
        if(episode % print_episode == 0):
            print("")
            print("Episode: " + str(episode+1))
            print("Epsilon: " + str(epsilon))
            print("Episode steps: " + str(step+1))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Policy matrix: ") 
            print_policy(policy_matrix)
        if(episode % movie_episode == 0):
            print("Saving the reward plot in: ./reward.png")
            plot_curve(reward_list, filepath="./reward.png", 
                       x_label="Episode", y_label="Reward",
                       x_range=(0, len(reward_list)), y_range=(-0.1,100),
                       color="red", kernel_size=500, 
                       alpha=0.4, grid=True)
            print("Saving the step plot in: ./step.png")
            plot_curve(step_list, filepath="./step.png", 
                       x_label="Episode", y_label="Steps", 
                       x_range=(0, len(step_list)), y_range=(-0.1,100),
                       color="blue", kernel_size=500, 
                       alpha=0.4, grid=True)
            print("Saving the gif in: ./inverted_pendulum.gif")
            env.render(file_path='./inverted_pendulum.gif', mode='gif')
            print("Complete!")

    print("Policy matrix after " + str(tot_episode) + " episodes:")
    print_policy(policy_matrix)
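The helpers called by main() above (print_policy, return_decayed_value, return_epsilon_greedy_action, get_return, update_policy, plot_curve) are not part of this listing. The sketches below reconstruct the four algorithmic ones purely from how they are called here, assuming a standard first-visit Monte Carlo control setup (linear epsilon decay, epsilon-greedy selection over the policy matrix, discounted returns, greedy policy improvement); they are illustrative, not the author's exact implementations. print_policy and plot_curve are plain reporting utilities and are omitted.
import numpy as np

def return_decayed_value(starting_value, minimum_value, global_step, decay_step):
    # Assumed linear decay from starting_value down to minimum_value over decay_step steps.
    decayed = starting_value - (starting_value - minimum_value) * (global_step / float(decay_step))
    return max(minimum_value, decayed)

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    # With probability epsilon take a random action, otherwise follow the policy matrix.
    tot_actions = int(np.amax(policy_matrix)) + 1  # assumes every action index appears in the matrix
    if np.random.uniform(0, 1) < epsilon:
        return np.random.randint(0, tot_actions)
    return int(policy_matrix[observation[0], observation[1]])

def get_return(state_list, gamma):
    # Discounted return of an episode tail: sum over t of gamma^t * reward_t.
    return_value = 0.0
    for counter, visit in enumerate(state_list):
        reward = visit[2]
        return_value += reward * np.power(gamma, counter)
    return return_value

def update_policy(episode_list, policy_matrix, state_action_matrix, tot_bins):
    # Greedy improvement: for every state visited in the episode, pick the action
    # with the highest estimated state-action value.
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0] * tot_bins)
        policy_matrix[observation[0], observation[1]] = np.argmax(state_action_matrix[:, col])
    return policy_matrix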
Example #4
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from inverted_pendulum import InvertedPendulum
import random

my_pole = InvertedPendulum(pole_mass=2.0,
                           cart_mass=8.0,
                           pole_lenght=0.5,
                           delta_t=0.1)
cumulated_reward = 0
print("Starting random agent...")
for step in range(100):
    action = random.randint(a=0, b=2)
    observation, reward, done = my_pole.step(action)
    cumulated_reward += reward
    print("Step: " + str(step))
    print("Action: " + str(action))
    print("Angle: " + str(observation[0]))
    print("Velocity: " + str(observation[1]))
    print("Reward: " + str(reward))
    print("")
    if done: break
print("Finished after: " + str(step + 1) + " steps")
print("Cumulated Reward: " + str(cumulated_reward))