# make_animation replays the logged states as a matplotlib artist animation.
# np, plt and animation (matplotlib.animation) are imported at the top of the
# original module; SIM_FPS and InvertedPendulum are defined elsewhere in it.
def make_animation(state_log, input_log):
    fig = plt.figure()
    ax = plt.axes()
    ax.set_aspect('equal')
    plt.axis(xmin=-1.8, xmax=1.8, ymin=-1.2, ymax=1.2)
    ip = InvertedPendulum()
    frames = []
    for (state, force) in zip(state_log, input_log):  # 'force' avoids shadowing the builtin input()
        ip.state = state
        ip.force = force
        frame = ip.draw(ax)
        x, xdot, a, adot = ip.state  # cart position/velocity, pole angle/angular velocity
        frame.append(plt.text(0.5, -0.5, 'x=%.5f' % x))
        frame.append(plt.text(0.5, -0.6, "x'=%.5f" % xdot))
        frame.append(plt.text(0.5, -0.7, 'a=%.5f' % a))
        frame.append(plt.text(0.5, -0.8, "a'=%.5f" % adot))
        frame.append(plt.text(0.5, -0.9, 'u=%.5f' % ip.force))
        frame.append(plt.text(0.5, -1.0, 'E=%.5f' % ip.total_energy()))
        frames.append(frame)
    anim = animation.ArtistAnimation(fig, frames, interval=1000 / SIM_FPS)
    #anim.save("output.mp4")
    #anim.save("output.gif", writer='imagemagick')
    plt.show()
# run_simulation advances the pendulum at SIM_FPS steps per second for the
# given duration, logging the state and the applied input at every step.
def run_simulation(initial_state, controller, duration):
    sim_step = 1 / SIM_FPS
    sim_iterations = int(duration * SIM_FPS)
    state_log = []
    input_log = []
    ip = InvertedPendulum()
    ip.state = initial_state
    for i in range(sim_iterations):
        ip.force = controller.process(ip.state)
        state_log.append(np.copy(ip.state))
        input_log.append(np.copy(ip.force))
        ip.step_rk4(sim_step)  # integrate one step with fourth-order Runge-Kutta
    return (state_log, input_log)
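# Below is a minimal usage sketch, not part of the original sources. It assumes
# SIM_FPS and the InvertedPendulum class used above are available, and that a
# controller is any object with a process(state) method returning a scalar
# force. ZeroController is a hypothetical stand-in for a real controller.
class ZeroController:
    """Hypothetical controller that applies no force to the cart."""
    def process(self, state):
        return 0.0

if __name__ == '__main__':
    initial_state = np.array([0.0, 0.0, 0.1, 0.0])  # x, x', angle, angle'
    state_log, input_log = run_simulation(initial_state, ZeroController(), duration=5.0)
    make_animation(state_log, input_log)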
def main():
    # Note: 'pole_lenght' (sic) matches the parameter name in inverted_pendulum.py
    env = InvertedPendulum(pole_mass=2.0, cart_mass=8.0, pole_lenght=0.5, delta_t=0.1)
    # Define the state arrays for velocity and position
    tot_action = 3  # three possible actions
    tot_bins = 12  # the value used to discretize the space
    # tot_bins-1 bin edges, so np.digitize returns tot_bins discrete bins
    velocity_state_array = np.linspace(-np.pi, np.pi, num=tot_bins-1, endpoint=False)
    position_state_array = np.linspace(-np.pi/2.0, np.pi/2.0, num=tot_bins-1, endpoint=False)
    # Random policy
    policy_matrix = np.random.randint(low=0, high=tot_action, size=(tot_bins, tot_bins))
    print("Policy Matrix:")
    print_policy(policy_matrix)
    state_action_matrix = np.zeros((tot_action, tot_bins*tot_bins))
    # Init with 1.0e-10 to avoid division by zero
    running_mean_matrix = np.full((tot_action, tot_bins*tot_bins), 1.0e-10)
    gamma = 0.999
    tot_episode = 500000  # 500k
    epsilon_start = 0.99  # values for the epsilon decay
    epsilon_stop = 0.1
    epsilon_decay_step = 10000
    print_episode = 500  # print every...
    movie_episode = 20000  # movie saved every...
    reward_list = list()
    step_list = list()
    for episode in range(tot_episode):
        epsilon = return_decayed_value(epsilon_start, epsilon_stop, episode,
                                       decay_step=epsilon_decay_step)
        # Starting a new episode
        episode_list = list()
        # Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        observation = (np.digitize(observation[1], velocity_state_array),
                       np.digitize(observation[0], position_state_array))
        is_starting = True
        cumulated_reward = 0
        for step in range(100):
            # Take the action from the action matrix
            action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=epsilon)
            # If the episode just started it is necessary to choose
            # a random action (exploring starts)
            if is_starting:
                action = np.random.randint(0, tot_action)
                is_starting = False
            # Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_observation = (np.digitize(new_observation[1], velocity_state_array),
                               np.digitize(new_observation[0], position_state_array))
            # Append the visit to the episode list
            episode_list.append((observation, action, reward))
            observation = new_observation
            cumulated_reward += reward
            if done:
                break
        # The episode is finished, now estimating the utilities
        counter = 0
        # Checkup matrix to identify the first visit to a state-action pair
        checkup_matrix = np.zeros((tot_action, tot_bins*tot_bins))
        # This cycle is the implementation of First-Visit MC.
        # For each state-action pair stored in the episode list, check if it
        # is the first visit and then estimate the return.
        for visit in episode_list:
            observation = visit[0]
            action = visit[1]
            col = observation[1] + (observation[0]*tot_bins)
            row = action
            if checkup_matrix[row, col] == 0:
                return_value = get_return(episode_list[counter:], gamma)
                running_mean_matrix[row, col] += 1
                state_action_matrix[row, col] += return_value
                checkup_matrix[row, col] = 1
            counter += 1
        # Policy update
        policy_matrix = update_policy(episode_list, policy_matrix,
                                      state_action_matrix/running_mean_matrix, tot_bins)
        # Store the data for statistics
        reward_list.append(cumulated_reward)
        step_list.append(step)
        # Printing utilities
        if episode % print_episode == 0:
            print("")
            print("Episode: " + str(episode+1))
            print("Epsilon: " + str(epsilon))
            print("Episode steps: " + str(step+1))
            print("Cumulated Reward: " + str(cumulated_reward))
            print("Policy matrix: ")
            print_policy(policy_matrix)
        if episode % movie_episode == 0:
            print("Saving the reward plot in: ./reward.png")
            plot_curve(reward_list, filepath="./reward.png",
                       x_label="Episode", y_label="Reward",
                       x_range=(0, len(reward_list)), y_range=(-0.1, 100),
                       color="red", kernel_size=500, alpha=0.4, grid=True)
            print("Saving the step plot in: ./step.png")
            plot_curve(step_list, filepath="./step.png",
                       x_label="Episode", y_label="Steps",
                       x_range=(0, len(step_list)), y_range=(-0.1, 100),
                       color="blue", kernel_size=500, alpha=0.4, grid=True)
            print("Saving the gif in: ./inverted_pendulum.gif")
            env.render(file_path='./inverted_pendulum.gif', mode='gif')
            print("Complete!")
    print("Policy matrix after " + str(tot_episode) + " episodes:")
    print_policy(policy_matrix)
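# The loop above relies on several helpers (return_decayed_value, get_return,
# return_epsilon_greedy_action, update_policy, plus the print_policy and
# plot_curve utilities) that are defined elsewhere in the original sources.
# The sketches below are one plausible implementation written against the
# conventions used above; treat them as assumptions, not the original code.

def return_decayed_value(starting_value, minimum_value, global_step, decay_step):
    # Exponential decay from starting_value, clipped at minimum_value
    # (assumed schedule; the original decay law may differ).
    decayed_value = starting_value * np.power(0.1, global_step / decay_step)
    return max(decayed_value, minimum_value)

def get_return(state_list, gamma):
    # Discounted return of an episode tail: sum over t of gamma^t * reward_t,
    # where each entry is an (observation, action, reward) tuple.
    return_value = 0.0
    for counter, (_, _, reward) in enumerate(state_list):
        return_value += reward * np.power(gamma, counter)
    return return_value

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    # With probability epsilon pick a random action, otherwise follow the
    # policy. Recovering the action count from the matrix values is a sketch
    # shortcut; it assumes the highest-numbered action appears in the policy.
    tot_actions = int(np.nanmax(policy_matrix) + 1)
    if np.random.uniform(0, 1) <= epsilon:
        return np.random.randint(0, tot_actions)
    return int(policy_matrix[observation[0], observation[1]])

def update_policy(episode_list, policy_matrix, state_action_matrix, tot_bins):
    # Greedy improvement: for each state visited in the episode, pick the
    # action with the highest estimated state-action value.
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0] * tot_bins)
        policy_matrix[observation[0], observation[1]] = np.argmax(state_action_matrix[:, col])
    return policy_matrix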
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from inverted_pendulum import InvertedPendulum
import random

# Random agent baseline: applies a uniformly random action at each step.
my_pole = InvertedPendulum(pole_mass=2.0, cart_mass=8.0, pole_lenght=0.5, delta_t=0.1)
cumulated_reward = 0
print("Starting random agent...")
for step in range(100):
    action = random.randint(a=0, b=2)
    observation, reward, done = my_pole.step(action)
    cumulated_reward += reward
    print("Step: " + str(step))
    print("Action: " + str(action))
    print("Angle: " + str(observation[0]))
    print("Velocity: " + str(observation[1]))
    print("Reward: " + str(reward))
    print("")
    if done:
        break
print("Finished after: " + str(step + 1) + " steps")
print("Cumulated Reward: " + str(cumulated_reward))