import os
import datetime

import rlsim_env
import distdeepq
# `logger` (experiment logging) and `discrete_action` (env-specific action
# discretization helper) are assumed to be imported/defined elsewhere in the
# original project; they are not part of this excerpt.


def exp(env_name='straight_4lane_obs_cam',
        lr=1e-4,
        eps=0.0003125,
        max_timesteps=25e6,
        buffer_size=1e6,
        batch_size=32,
        exp_t1=1e6,
        exp_p1=0.1,
        exp_t2=25e6,
        exp_p2=0.01,
        train_freq=4,
        learning_starts=5e4,
        target_network_update_freq=1e4,
        gamma=0.99,
        num_cpu=50,
        dist_params={'Vmin': -10, 'Vmax': 10, 'nb_atoms': 51},
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        # action_res=None
        ):
    env = rlsim_env.make(env_name)
    # env = wrapper_car_racing(env)  # frame stack
    logger.configure(dir=os.path.join('.', datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")))
    # logger.configure()
    n_action, action_map = discrete_action(env, env_name)  # , action_res=action_res)

    # Distributional CNN + MLP model for the per-action return distribution.
    model = distdeepq.models.cnn_to_dist_mlp(
        convs=convs,      # [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
        hiddens=hiddens,  # [512]
        # n_action=n_action,
        dueling=False
    )

    act = distdeepq.learn(
        env,
        p_dist_func=model,
        lr=lr,  # 1e-4
        eps=eps,
        max_timesteps=int(max_timesteps),  # 25M
        buffer_size=int(buffer_size),      # 1M
        batch_size=int(batch_size),
        exp_t1=exp_t1,
        exp_p1=exp_p1,
        exp_t2=exp_t2,
        exp_p2=exp_p2,
        train_freq=train_freq,
        learning_starts=learning_starts,  # 50000
        target_network_update_freq=target_network_update_freq,  # 10000
        gamma=gamma,
        num_cpu=num_cpu,
        prioritized_replay=False,
        dist_params=dist_params,
        n_action=int(n_action),
        action_map=action_map
    )
    act.save("car_racing_model.pkl")
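# A minimal way to launch the experiment above, assuming the file is executed
# directly (this entry point is a sketch and is not part of the original excerpt);
# any keyword argument of exp() can be overridden here, e.g. for a shorter run.
if __name__ == '__main__':
    exp(env_name='straight_4lane_obs_cam', max_timesteps=1e6, num_cpu=8)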
import rlsim_env as rl
import numpy as np
from real_safe_pid import PID_safe

n_lane = 2
n_obs = 4

print('making environment')
env = rl.make("straight_lane", obs_num=n_obs, nr_lane=n_lane)
print('done')

pid = PID_safe(n_lane)

print('reset environment')
s = env.reset()
print('done')
pid.reset(pid.get_current_lane(s))

done = False
ep_num = 0
step_num = 0
reward = 0
successes = []

while True:
    step_num += 1
    if done:
        # An episode counts as a success if it ran for (nearly) the full time limit.
        success = step_num > 498
        successes.append(success)
        step_num = 0
        ep_num += 1
        s = env.reset()
        # pid.reset(pid.get_current_lane(s))
        done = False
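# A minimal sketch for reporting the running success rate of the PID baseline,
# assuming `successes` has been populated as in the loop above (the PID control /
# env.step portion of that loop is truncated in this excerpt).
if successes:
    rate = float(np.mean(successes))
    print('episodes: {}, success rate: {:.3f}'.format(len(successes), rate))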
import random

import numpy as np
import rlsim_env
# `Policy` and `Value` are assumed to be imported from the project's PPO
# policy / value-function modules; the imports are not part of this excerpt.


def convert_action(action):  # hypothetical name; the original signature is truncated in the source
    if len(action) == 1:
        action = action[0]
    angle_cmd = 4 * (action[0] - 0.5)  # map a [0, 1] steering output to [-2, 2]
    acc = action[1]
    pedal_cmd = acc
    # break_cmd = 2*(0.5-acc) if acc < 0.5 else 0
    break_cmd = 0
    return [angle_cmd, pedal_cmd, break_cmd]


# env = gym.make('Pendulum-v0')
env = rlsim_env.make('straight_4lane_cam')
print('updated')

seed = 0
np.random.seed(seed)
random.seed(seed)

obs_dim = env.observation_space.shape  # .shape[0]
act_dim = 2  # env.action_space  # .shape[0]

policy = Policy(obs_dim, act_dim, epochs=50,
                convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256],
                lr=3e-4, clip_range=0.2, seed=seed)
val_func = Value(obs_dim, epochs=100,
                 convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256],
                 lr=1e-3, seed=seed)

episode_size = 8  # 80
batch_size = 128
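# A quick sanity check of the action mapping above (using the hypothetical
# convert_action name from the reconstruction): a centered steering output of 0.5
# maps to a zero steering angle, the second component passes through as the pedal
# command, and the brake stays at 0.
assert convert_action([0.5, 0.25]) == [0.0, 0.25, 0]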
import rlsim_env
# Evaluation script: `Policy`, `Value`, and `seed` are defined in parts of the
# original file that are truncated from this excerpt.


def convert_action(action):  # hypothetical name; the original signature is truncated in the source
    if len(action) == 1:
        action = action[0]
    angle_cmd = 4 * (action[0] - 0.5)
    acc = action[1]
    pedal_cmd = acc
    # break_cmd = 2*(0.5-acc) if acc < 0.5 else 0
    break_cmd = 0
    return [angle_cmd, pedal_cmd, break_cmd]


env = rlsim_env.make('straight_4lane')

obs_dim = env.observation_space
act_dim = 2  # env.action_space

policy = Policy(obs_dim, act_dim, epochs=50, hdim=64, lr=3e-4, clip_range=0.2, seed=seed)
val_func = Value(obs_dim, epochs=100, hdim=64, lr=1e-3, seed=seed)

# Restore previously trained policy / value networks from checkpoint index 0.
policy.restore_graph(0)
val_func.restore_graph(0)
import rlsim_env
# `DQNAgent` is assumed to be imported from the project's DQN agent module;
# the import is not part of this excerpt.


def convert_action(action):  # hypothetical name; the original signature is truncated in the source
    # The 9 discrete actions factor into 3 steering commands x 3 pedal commands.
    steering = action // 3  # reconstructed: only the `accel` line survives in the excerpt
    accel = action % 3
    if steering == 0:
        angle_cmd = -1.0
    elif steering == 1:
        angle_cmd = 0.0
    else:
        angle_cmd = 1.0
    pedal_cmd = 0.5 if accel == 1 else 0
    break_cmd = 0.5 if accel == 2 else 0
    return [angle_cmd, pedal_cmd, break_cmd]


env = rlsim_env.make('straight_4lane_obs')

obs_dim = env.observation_space
act_dim = 9  # env.action_space

mode = ['train', 'test']
cur_mode = 'train'
max_t = env.time_limit

agent = DQNAgent(obs_dim, act_dim,
                 memory_mode='PER',
                 target_mode='DDQN',
                 policy_mode='argmax',
                 restore=False,
                 net_dir='q_learning2_iter_1000.ckpt'
                 )  # memory_mode='PER', target_mode='DDQN'