Exemplos de CartPole.CartPole em Python

Linguagem de programação: Python

Espaço de nomes / nome do pacote: env

Classe / Tipo: CartPole

Método / Função: CartPole

Exemplos em hotexamples.com: 2

CartPole.CartPole em Python - 2 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de env.CartPole.CartPole em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir Ocultar

env(4)

CartPole(2)

get_state(2)

simulate(2)

TfEnv(1)

plot_cart(1)

show_cart(1)

Métodos Frequentes

env (4)

CartPole (2)

get_state (2)

simulate (2)

TfEnv (1)

plot_cart (1)

show_cart (1)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: p06_cartpole.py Projeto: Virusdoll/Stanford.CS229.2018.autumn.ps

def _plot_learning_curve(time_steps_to_failure, seed):
    """Plot log(steps balanced) per trial and save it under ``output/``.

    Draws the raw log curve in black and a 30-trial uniform moving average
    (computed with ``lfilter``) as a dashed red line, then writes the figure
    to ``output/control_<seed>.png``.
    """
    import os  # local import: the file's top-level import block is not visible here

    log_tstf = np.log(np.array(time_steps_to_failure))
    plt.plot(np.arange(len(time_steps_to_failure)), log_tstf, 'k')

    window = 30
    w = np.full(window, 1 / window)
    weights = lfilter(w, 1, log_tstf)
    # Drop the first `window` filter outputs (warm-up) and shift the x axis by
    # half a window so the smoothed curve lines up with the raw one.
    x = np.arange(window // 2, len(log_tstf) - window // 2)
    plt.plot(x, weights[window:len(log_tstf)], 'r--')

    plt.xlabel('Num failures')
    plt.ylabel('Log of num steps to failure')
    plt.title('seed = {}'.format(seed))

    # savefig does not create missing directories; make sure the target exists.
    os.makedirs('output', exist_ok=True)
    plt.savefig('output/control_{}.png'.format(seed))


def main(plot=True):
    """Run the cart-pole simulation while learning an MDP model of it.

    The loop repeatedly picks an action, simulates one step, records the
    observed transition/reward, and, every time the pole falls, re-estimates
    the MDP model and its value function. It stops once
    ``NO_LEARNING_THRESHOLD`` consecutive value-function updates report
    convergence within one iteration, or after ``max_failures`` failures.

    Args:
        plot: When true, plot the learning curve and save it to
            ``output/control_<seed>.png``.

    Returns:
        A NumPy array with the number of time steps the pole stayed balanced
        in each recorded trial.
    """
    # Seed the randomness of the simulation so this outputs the same thing
    # each time.
    seed = 0
    np.random.seed(seed)

    # Simulation parameters
    NUM_STATES = 163
    GAMMA = 0.995
    TOLERANCE = 0.01
    NO_LEARNING_THRESHOLD = 20
    # The last state index is the one the reward logic treats as "pole fell".
    failure_state = NUM_STATES - 1

    # Time cycle of the simulation
    step = 0

    # Bookkeeping: how many cycles the pole was balanced for before it fell.
    # Useful for plotting learning curves.
    time_steps_to_failure = []
    num_failures = 0
    time_at_start_of_current_trial = 0

    # You should reach convergence well before this
    max_failures = 500

    # Initialize a cart pole
    cart_pole = CartPole(Physics())

    # The continuous state vector is (x, x_dot, theta, theta_dot), starting at
    # rest; `state` is the number given to this state by the cart pole.
    state_tuple = (0.0, 0.0, 0.0, 0.0)
    state = cart_pole.get_state(state_tuple)

    mdp_data = initialize_mdp_data(NUM_STATES)

    # End the simulation once the previous NO_LEARNING_THRESHOLD consecutive
    # value function computations all converged within one iteration:
    # there is presumably little left to learn after that.
    consecutive_no_learning_trials = 0
    while consecutive_no_learning_trials < NO_LEARNING_THRESHOLD:
        action = choose_action(state, mdp_data)

        # Get the next state by simulating the dynamics
        state_tuple = cart_pole.simulate(action, state_tuple)
        step += 1

        # Get the state number corresponding to the new state vector
        new_state = cart_pole.get_state(state_tuple)
        pole_fell = new_state == failure_state

        # Reward function to use - do not change this!
        R = -1 if pole_fell else 0
        update_mdp_transition_counts_reward_counts(mdp_data, state, action, new_state, R)

        if not pole_fell:
            state = new_state
            continue

        # Recompute the MDP model whenever the pole falls, then compute the
        # value function V for the new model.
        update_mdp_transition_probs_reward(mdp_data)
        converged_in_one_iteration = update_mdp_value(mdp_data, TOLERANCE, GAMMA)
        if converged_in_one_iteration:
            consecutive_no_learning_trials += 1
        else:
            consecutive_no_learning_trials = 0

        # Handle the failure: record the trial length and reinitialize the
        # state. The trial that hits `max_failures` is not recorded.
        num_failures += 1
        if num_failures >= max_failures:
            break
        print('[INFO] Failure number {}'.format(num_failures))
        time_steps_to_failure.append(step - time_at_start_of_current_trial)
        time_at_start_of_current_trial = step

        # Reinitialize state with a random cart position in [-1.1, 1.1)
        x = -1.1 + np.random.uniform() * 2.2
        state_tuple = (x, 0.0, 0.0, 0.0)
        state = cart_pole.get_state(state_tuple)

    if plot:
        # Plot the learning curve (time balanced vs. trial)
        _plot_learning_curve(time_steps_to_failure, seed)

    return np.array(time_steps_to_failure)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: main.py Projeto: mjyoo2/python_study

from stable_baselines import DQN
from env import CartPole
from stable_baselines.deepq.policies import LnMlpPolicy

# Script entry point: train a DQN agent on the project's CartPole environment.
if __name__ == '__main__':
    # Build the environment and hand it to a DQN model that uses the
    # LnMlpPolicy policy class.
    cart_pole_env = CartPole()
    agent = DQN(LnMlpPolicy, cart_pole_env)

    print('start')
    # Train for a fixed budget of 10000 environment time steps.
    agent.learn(total_timesteps=10000)