def __init__(self, args, shared_value, share_net):
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.share_net = share_net
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net2 = QNet(args).to(self.device)
    self.actor_share = share_net[4]
    self.Q_net1_share = share_net[0]
    self.Q_net2_share = share_net[2]
    self.log_alpha_share = share_net[-1]
    self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0
    self.evaluation_interval = 50000
    self.max_state_num_evaluated_in_an_episode = 500
    self.episode_num_to_run = 10
    self.iteration_history = []
    self.evaluated_Q_mean_history = []
    self.true_gamma_return_mean_history = []
    # self.n_episodes_info_history = []
    self.evaluated_Q_history = []
    self.true_gamma_return_history = []
def __init__(self, state_size, action_size, seed, double_agent=False, dueling_agent=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        double_agent (bool): True if we want to use DDQN
        dueling_agent (bool): True if we want to use Dueling
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.double_agent = double_agent
    self.dueling_agent = dueling_agent

    self.qnetwork_local = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
    self.qnetwork_target = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
    self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
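# Usage sketch for the Agent above. The sizes are hypothetical (e.g. a
# LunarLander-style task), and BUFFER_SIZE, BATCH_SIZE, LR, UPDATE_EVERY and
# device are assumed to be module-level constants defined elsewhere:
agent = Agent(state_size=8, action_size=4, seed=0,
              double_agent=True, dueling_agent=True)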
def __init__(self, args, shared_value, share_net):
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    eval_params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': int(2000 + 3 * args.num_actors),  # connection port
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.share_net = share_net
    self.args = args
    self.env = gym.make(args.env_name, params=eval_params)
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net2 = QNet(args).to(self.device)
    self.actor_share = share_net[4]
    self.Q_net1_share = share_net[0]
    self.Q_net2_share = share_net[2]
    self.log_alpha_share = share_net[-1]
    self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0
    self.evaluation_interval = 20000
    self.max_state_num_evaluated_in_an_episode = 50  # 500
    self.episode_num_evaluation = 5
    self.episode_num_test = 5
    self.time = time.time()
    self.list_of_n_episode_rewards_history = []
    self.time_history = []
    self.alpha_history = []
    self.average_return_with_diff_base_history = []
    self.average_reward_history = []
    self.iteration_history = []
    self.evaluated_Q_mean_history = []
    self.evaluated_Q_std_history = []
    self.true_gamma_return_mean_history = []
    self.policy_entropy_history = []
    self.a_std_history = []
    self.a_abs_history = []
def __init__(self, args, shared_value):
    super(Simulation, self).__init__()
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    simu_params = {
        'number_of_vehicles': 0,
        'number_of_walkers': 0,
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }
    self.stop_sign = shared_value[1]
    self.args = args
    self.env = gym.make(args.env_name, params=simu_params)
    self.device = torch.device("cpu")
    self.load_index = self.args.max_train  # self.load_index = 40000
    self.actor = PolicyNet(args).to(self.device)
    self.actor.load_state_dict(torch.load(
        './' + self.args.env_name + '/method_' + str(self.args.method) +
        '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu'))
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net1.load_state_dict(torch.load(
        './' + self.args.env_name + '/method_' + str(self.args.method) +
        '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu'))
    if self.args.double_Q:
        self.Q_net2 = QNet(args).to(self.device)
        self.Q_net2.load_state_dict(torch.load(
            './' + self.args.env_name + '/method_' + str(self.args.method) +
            '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu'))
    self.test_step = 0
    self.save_interval = 10000
    self.iteration = 0
    self.reward_history = []
    self.entropy_history = []
    self.epoch_history = []
    self.done_history = []
    self.Q_real_history = []
    self.Q_history = []
    self.Q_std_history = []
def __init__(self, args, shared_value):
    super(Simulation, self).__init__()
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.stop_sign = shared_value[1]
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.load_index = self.args.max_train
    self.actor = PolicyNet(args).to(self.device)
    self.actor.load_state_dict(torch.load(
        './' + self.args.env_name + '/method_' + str(self.args.method) +
        '/model/policy1_' + str(self.load_index) + '.pkl', map_location='cpu'))
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net1.load_state_dict(torch.load(
        './' + self.args.env_name + '/method_' + str(self.args.method) +
        '/model/Q1_' + str(self.load_index) + '.pkl', map_location='cpu'))
    if self.args.double_Q:
        self.Q_net2 = QNet(args).to(self.device)
        self.Q_net2.load_state_dict(torch.load(
            './' + self.args.env_name + '/method_' + str(self.args.method) +
            '/model/Q2_' + str(self.load_index) + '.pkl', map_location='cpu'))
    self.test_step = 0
    self.save_interval = 10000
    self.iteration = 0
    self.reward_history = []
    self.entropy_history = []
    self.epoch_history = []
    self.done_history = []
    self.Q_real_history = []
    self.Q_history = []
    self.Q_std_history = []
def __init__(self, args, shared_queue, shared_value):
    super(Simulation, self).__init__()
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.policy_test_queue = shared_queue[3]
    self.stop_sign = shared_value[1]
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.load_index = 20000
    self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high,
                           args.action_low, args.NN_type).to(self.device)
    self.actor.load_state_dict(torch.load(
        './data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))
    self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                          args.NN_type).to(self.device)
    self.Q_net1_m0.load_state_dict(torch.load(
        './data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))
    self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                          args.NN_type).to(self.device)
    self.Q_net1_m1.load_state_dict(torch.load(
        './data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
    self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                          args.NN_type).to(self.device)
    self.Q_net2_m1.load_state_dict(torch.load(
        './data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))
    self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                          args.NN_type).to(self.device)
    self.Q_net1_m2.load_state_dict(torch.load(
        './data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))
    self.test_step = 0
    self.save_interval = 10000
    self.iteration = 0
    self.reward_history = []
    self.entropy_history = []
    self.epoch_history = []
    self.done_history = []
    self.Q_real_history = []
    self.Q_m0_history = []
    self.Q_m1_history = []
    self.Q_m2_history = []
    self.Q_std_m2_history = []
def __init__(self, args, shared_value, share_net):
    seed = args.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.share_net = share_net
    self.args = args
    self.env = gym.make(args.env_name)
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net2 = QNet(args).to(self.device)
    self.actor_share = share_net[4]
    self.Q_net1_share = share_net[0]
    self.Q_net2_share = share_net[2]
    self.log_alpha_share = share_net[-1]
    self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0
    self.evaluation_interval = 20000
    self.max_state_num_evaluated_in_an_episode = 500
    self.episode_num_evaluation = 5
    self.episode_num_test = 5
    self.time = time.time()
    self.list_of_n_episode_rewards_history = []
    self.time_history = []
    self.alpha_history = []
    self.average_return_with_diff_base_history = []
    self.average_reward_history = []
    self.iteration_history = []
    self.evaluated_Q_mean_history = []
    self.evaluated_Q_std_history = []
    self.true_gamma_return_mean_history = []
    self.policy_entropy_history = []
    self.a_std_history = []
    self.a_abs_history = []
def __init__(self, args, shared_queue, shared_value, lock, i):
    super(Actor, self).__init__()
    self.agent_id = i
    seed = args.seed + np.int64(self.agent_id)
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.experience_queue = shared_queue[0]
    self.policy_param_queue = shared_queue[1]
    self.q_param_queue = shared_queue[2]
    self.counter = shared_value[0]
    self.stop_sign = shared_value[1]
    self.lock = lock
    self.env = gym.make(args.env_name)
    self.args = args
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high,
                           args.action_low, args.NN_type).to(self.device)
    self.Q_net1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell,
                       args.NN_type).to(self.device)
def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
    super(Actor, self).__init__()
    self.agent_id = i
    seed = args.seed + np.int64(self.agent_id)
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.counter = shared_value[0]
    self.stop_sign = shared_value[1]
    self.lock = lock
    self.env = gym.make(args.env_name)
    self.args = args
    self.experience_in_queue = []
    for buffer_index in range(args.num_buffers):  # renamed to avoid shadowing the agent index `i`
        self.experience_in_queue.append(shared_queue[0][buffer_index])
    self.device = torch.device("cpu")
    self.actor = PolicyNet(args).to(self.device)
    self.Q_net1 = QNet(args).to(self.device)
    self.Q_net1_share = share_net[1]
    self.actor_share = share_net[0]
from __future__ import print_function
def __init__(self, args, shared_queue, shared_value, share_net, share_optimizer,
             device, lock, i):
    super(Learner, self).__init__()
    self.args = args
    seed = self.args.seed
    self.init_time = self.args.init_time
    np.random.seed(seed)
    torch.manual_seed(seed)
    self.agent_id = i
    self.experience_out_queue = []
    for buffer_index in range(args.num_buffers):  # renamed to avoid shadowing the learner index `i`
        self.experience_out_queue.append(shared_queue[1][buffer_index])
    self.stop_sign = shared_value[1]
    self.iteration_counter = shared_value[2]
    self.iteration = self.iteration_counter.value
    self.device = device
    self.gpu = self.device != torch.device("cpu")
    self.lock = lock
    (self.Q_net1_share, self.Q_net1_target_share,
     self.Q_net2_share, self.Q_net2_target_share,
     self.actor1_share, self.actor1_target_share,
     self.actor2_share, self.actor2_target_share,
     self.log_alpha_share) = share_net
    (self.Q_net1_optimizer, self.Q_net2_optimizer,
     self.actor1_optimizer, self.actor2_optimizer,
     self.alpha_optimizer) = share_optimizer

    self.Q_net1 = QNet(args).to(self.device)
    self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
        self.Q_net1_optimizer, T_max=self.args.decay_T_max,
        eta_min=self.args.end_lr, last_epoch=-1)
    self.Q_net1.train()
    self.Q_net1_target = QNet(args).to(self.device)
    self.Q_net1_target.train()

    self.Q_net2 = QNet(args).to(self.device)
    self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
        self.Q_net2_optimizer, T_max=self.args.decay_T_max,
        eta_min=self.args.end_lr, last_epoch=-1)
    self.Q_net2.train()
    self.Q_net2_target = QNet(args).to(self.device)
    self.Q_net2_target.train()

    self.actor1 = PolicyNet(args).to(self.device)
    self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
        self.actor1_optimizer, T_max=self.args.decay_T_max,
        eta_min=self.args.end_lr, last_epoch=-1)
    self.actor1.train()
    self.actor1_target = PolicyNet(args).to(self.device)
    self.actor1_target.train()

    self.actor2 = PolicyNet(args).to(self.device)
    self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
        self.actor2_optimizer, T_max=self.args.decay_T_max,
        eta_min=self.args.end_lr, last_epoch=-1)
    self.actor2.train()
    self.actor2_target = PolicyNet(args).to(self.device)
    self.actor2_target.train()

    self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
        self.alpha_optimizer, T_max=self.args.decay_T_max,
        eta_min=self.args.end_lr, last_epoch=-1)
    if self.args.alpha == 'auto':
        self.target_entropy = args.target_entropy
    else:
        self.alpha = torch.tensor(self.args.alpha)
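# The Learner's update loop is not part of this excerpt. Target networks like
# the ones built above are typically synced with Polyak averaging; a minimal
# sketch, assuming a soft-update rate hyperparameter such as args.tau (not
# shown in this code):
def soft_update(target_net, source_net, tau):
    """Blend source parameters into the target: theta_t <- (1 - tau) * theta_t + tau * theta_s."""
    with torch.no_grad():
        for t, s in zip(target_net.parameters(), source_net.parameters()):
            t.mul_(1.0 - tau).add_(tau * s)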
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()

    actor1 = PolicyNet(args)
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    share_net = [Q_net1, Q_net1_target, Q_net2, Q_net2_target,
                 actor1, actor1_target, actor2, actor2_target, log_alpha]
    share_optimizer = [Q_net1_optimizer, Q_net2_optimizer,
                       actor1_optimizer, actor2_optimizer, alpha_optimizer]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]

    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []

    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(Process(target=actor_agent,
                                 args=(args, shared_queue, shared_value,
                                       [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(Process(target=buffer,
                                 args=(args, shared_queue, shared_value, i)))
        procs.append(Process(target=evaluate_agent,
                             args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            # device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(Process(target=leaner_agent,
                                 args=(args, shared_queue, shared_value, share_net,
                                       share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
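# A plausible entry point for the trainer above, sketched under an assumption:
# the method index 0 is a placeholder, since this excerpt does not show how
# built_parser maps indices to algorithm variants.
if __name__ == "__main__":
    main(method=0)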
def main(method):
    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }
    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)

    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]
    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()

    actor1 = PolicyNet(args)
    print("Network inited")
    if args.code_model == "eval":
        actor1.load_state_dict(torch.load(
            './' + args.env_name + '/method_' + str(args.method) +
            '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()
    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())
    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()
    print("Optimizer done")

    share_net = [Q_net1, Q_net1_target, Q_net2, Q_net2_target,
                 actor1, actor1_target, actor2, actor2_target, log_alpha]
    share_optimizer = [Q_net1_optimizer, Q_net2_optimizer,
                       actor1_optimizer, actor2_optimizer, alpha_optimizer]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]

    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []

    if args.code_model == "train":
        for i in range(args.num_learners):
            # Alternate learner processes between the two GPUs.
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(Process(target=leaner_agent,
                                 args=(args, shared_queue, shared_value, share_net,
                                       share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(Process(target=actor_agent,
                                 args=(args, shared_queue, shared_value,
                                       [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(Process(target=buffer,
                                 args=(args, shared_queue, shared_value, i)))
        procs.append(Process(target=evaluate_agent,
                             args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load net
    device = torch.device('cpu')
    args = Args()
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl', map_location='cpu'))
    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl', map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)
    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)
    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)
    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()
        u, log_prob = actor.get_action(state_tensor.unsqueeze(0),
                                       info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)
        # unpack into info_dict so the conversion below uses the current
        # step's info rather than the stale dict from env.reset()
        obs, r, done, info_dict = env.step(u)
        info = info_dict_to_array(info_dict)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)
        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)
        if done:
            toc = time.time()
            print("An episode took %f s" % (toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            env.reset()
            ret = 0
            # print(env.ego.get_location())
            done = False
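# `info_dict_to_array` is used above but not defined in this excerpt. A
# generic sketch of what it is assumed to do (flatten the env's info dict
# into a single 1-D float32 vector; the actual key layout is repo-specific):
def info_dict_to_array(info_dict):
    parts = [np.asarray(v, dtype=np.float32).ravel() for v in info_dict.values()]
    return np.concatenate(parts) if parts else np.zeros(0, dtype=np.float32)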
    # 1  Cart Velocity          -Inf     Inf
    int(check_bound(np.degrees(observation[2]), np.arange(-11, 11, 1))),
    # 2  Pole Angle             -24 deg  24 deg
    int(check_bound(observation[3], np.arange(-0.88, 0.88, 0.08)))
    # 3  Pole Velocity At Tip   -Inf     Inf
]

# Create Agent
actions = range(env.action_space.n)
agent = Agent(None, (25, 25, 25), actions)
temp_agent = agent.__copy__()

# Create Network
net_size = 128
net = QNet(env.observation_space.shape[0], env.action_space.n, net_size, device).to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
net.train()

ok = False
guts = 0
i_episode = 0
total = 0
loss = 0
guts_required = 100
guts_print_div = 10
big_data = [[], []]

print("Learning...")
while not ok:
    # Agent learning
    while guts < guts_required:
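# `check_bound` is called above but not defined in this fragment. A minimal
# sketch of the discretization it appears to perform (assumption: return the
# index of the bin the value falls into, clipped to the valid range):
def check_bound(value, bins):
    # np.digitize counts how many bin edges lie at or below `value`;
    # clipping keeps out-of-range observations in the first/last bin.
    return int(np.clip(np.digitize(value, bins), 0, len(bins) - 1))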