def populate_replay_memory(replay_memory, init_size, start_params):
    # Fill the replay memory with transitions generated by SUMO's own
    # fixed-phase controller before training starts.
    state = reset_all()
    sg.set_departure_times()
    # state = sg.get_DTSE()
    rewardsContainer = RewardsContainer()
    state = state[0]
    old_past_state, old_new_state, old_action = None, None, None
    traffic_light_id = sg.get_all_traffic_light_ids()[0]
    action = None
    i = 0
    while(len(replay_memory) < init_size):
        if((i % 3) == 0):
            # if(int(traci.simulation.getCurrentTime()/1000) % time_step == 0):
            print(ERASE_LINE + "\r Step {} ({})\t Last Action: {}".format(
                len(replay_memory), init_size, action), end="")
            sys.stdout.flush()
            state = sg.get_DTSE()
            # Sample the reward metric before stepping and negate it.
            prev_rewards = rewardsContainer.get_all_rewards()
            reward = -prev_rewards[q_estimator.reward_index]
            traci.simulationStep()
            sg.set_departure_times()
            # Record whichever action the built-in phase plan took this step.
            action = traci.trafficlight.getPhase(traffic_light_id) // 2
            action = action if action < 3 else 3
            # sg.set_767_action(action)
            next_state = sg.get_DTSE()
            old_past_state, old_new_state, old_action = state, next_state, action
            replay_memory.add_sample(old_past_state, old_action, old_new_state, reward)
        else:
            traci.simulationStep()
            sg.set_departure_times()
        i += 1
        # Restart the episode once every vehicle has left the network.
        if(traci.simulation.getMinExpectedNumber() == 0):
            state = reset_all()
            state = state[0]
            rewardsContainer = RewardsContainer()
            i = 0
    traci.close()
def validate(q_estimator):
    try:
        traci.close()
    except Exception:
        pass
    # q_estimator.save_model()
    # q_estimator.load_model()
    rewardsContainer = RewardsContainer()
    traci.start([checkBinary('sumo-gui'), "-c", cfg_file,
                 "--tripinfo-output", "tripinfo.xml",
                 "--step-length", str(time_step),
                 "--time-to-teleport", "99999",
                 "-S", "-Q", "-W"])
    sg.set_departure_times()
    step = 0
    total_rewards = []
    while traci.simulation.getMinExpectedNumber() > 0:
        if(step >= VALIDATION_LENGTH):
            traci.close()
            break
        if((step % 3) == 0):
            # Act greedily with respect to the current Q-estimates.
            state = sg.get_DTSE()
            best_action = np.argmax(q_estimator.predict(state))
            sg.set_767_action(best_action)
            traci.simulationStep()
            sg.set_departure_times()
            rewards = rewardsContainer.get_all_rewards()
            total_rewards.append(rewards)
        else:
            traci.simulationStep()
            sg.set_departure_times()
        step += 1
    # One reward vector is collected every third step, so this is the mean.
    return np.sum(np.array(total_rewards), axis=0) / (VALIDATION_LENGTH / 3)
def reset_sim(params):
    # Restart SUMO with the given launch parameters and return the initial state.
    try:
        traci.close()
    except Exception:
        print("Errored")
    traci.start(params)
    sg.set_departure_times()
    # if(params[0] == checkBinary('sumo')):
    print("\033[9A\033[0J\033[1A")
    return sg.get_DTSE()
def evaluate(q_estimator, policy):
    try:
        traci.close()
    except Exception:
        pass
    # q_estimator.save_model()
    # q_estimator = pm.load_model(q_estimator.filename)
    traci.start([checkBinary('sumo-gui'), "-c", cfg_file,
                 "--tripinfo-output", "tripinfo.xml",
                 "--step-length", str(time_step),
                 "--time-to-teleport", "99999",
                 "-S", "-W", "-Q"])
    sg.set_departure_times()
    rewards_object = RewardsContainer()
    total_rewards = []
    step = 0
    throughput = 0
    while traci.simulation.getMinExpectedNumber() > 0:
        if(step >= EVALUATION_LENGTH):
            traci.close()
            print("\033[4A\033[0J\033[1A")
            break
        if(traci.simulation.getTime() % 3 == 0):
            state = sg.get_DTSE()
            state = torch.FloatTensor(state)
            prediction = q_estimator(state)
            # best_action = np.argmax(prediction.detach().numpy())
            best_action = policy(state)
            sg.set_767_action(best_action)
            traci.simulationStep()
            sg.set_departure_times()
            rewards = rewards_object.get_all_rewards()
            total_rewards.append(rewards)
            print(ERASE_LINE + "\r Step {} ({}) \tBest Action: {}\tPrediction Value: {}\tRewards: {}".format(
                step, EVALUATION_LENGTH,
                best_action,
                prediction[0][best_action],
                rewards[q_estimator.reward_index]), end="")
            sys.stdout.flush()
        else:
            traci.simulationStep()
            sg.set_departure_times()
        throughput += traci.simulation.getArrivedNumber()
        step += 1
    # Mean of each reward metric over the evaluation run, plus total throughput.
    return np.concatenate((np.mean(np.array(total_rewards), axis=0), [throughput]))
def do_train_epoch(q_estimator, replay_memory, optimizer, policy, n_steps, training_epoch, max_epochs):
    # Reset the simulation; the current state is read fresh below.
    reset_all()
    state = sg.get_DTSE()
    sg.set_departure_times()
    old_past_state, old_new_state, old_action = None, None, None
    reward, loss, action = 0, 0, 0
    g_step = int(q_estimator.get_global_step())
    # epsilon = epsilon_params["values"][min(g_step, epsilon_params["steps"]-1)]
    rewardsContainer = RewardsContainer()
    for t in range(n_steps):  # t in itertools.count():
        if(traci.simulation.getTime() % 3 == 0):
            # If at the step, update the target estimator
            g_step = q_estimator.get_global_step()
            # if(g_step % target_update == 0):
            #     pm.polyak_update(from_network=q_estimator, to_network=target_estimator)
            #     target_estimator.global_step += 1
            # Print the current step
            print(ERASE_LINE + "\r Step {:0>4d} ({}) @ Episode {:0>4d}/{}, loss: {:06.3f}, Prev Reward: {:04.4f}, Last Action: {}".format(
                t, g_step,
                training_epoch + 1, max_epochs, loss, reward, action), end="")
            sys.stdout.flush()
            # Perform one step and return the values
            action, next_state, rewards = do_sumo_step(policy, state, rewardsContainer)
            reward = -rewards[q_estimator.reward_index]
            # reward += penalty
            # Once we have history (t>0) we can add the experience to replay memory,
            # using the same negated-reward convention as populate_replay_memory.
            if(t > 0):
                replay_memory.add_sample(old_past_state, old_action, old_new_state, reward)
            old_past_state, old_new_state, old_action = state, next_state, action
            # prev_pen = penalty
            state = next_state
            q_estimator.global_step += 1
            # Run optimization step
            loss = optimize_model(q_estimator, replay_memory, optimizer)
        else:
            traci.simulationStep()
            sg.set_departure_times()
    return
def do_sumo_step(policy, state, rewardsContainer):
    # Choose our action and attempt to implement it
    action = policy(torch.FloatTensor(state))  # ,epsilon)
    sg.set_767_action(action)
    # Step
    traci.simulationStep()
    sg.set_departure_times()
    # Get our post-action state
    next_state = sg.get_DTSE()
    # Update our cars locations and waiting times after the action
    rewards = rewardsContainer.get_all_rewards()
    return action, next_state, rewards
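
# The block below is a minimal, hypothetical sketch of how the helpers in this
# module could be wired into a full training run. The replay_memory, optimizer,
# and policy objects, as well as the default epoch/step counts, are assumptions
# and are not defined in this file; adjust them to the rest of the project.
def run_training(q_estimator, replay_memory, optimizer, policy,
                 init_size=10000, max_epochs=50, steps_per_epoch=4800,
                 start_params=None):
    # Seed the replay buffer with experience from SUMO's built-in phase plan.
    populate_replay_memory(replay_memory, init_size, start_params)
    results = []
    for epoch in range(max_epochs):
        # One epoch of interaction under `policy` plus optimisation updates.
        do_train_epoch(q_estimator, replay_memory, optimizer, policy,
                       steps_per_epoch, epoch, max_epochs)
        # Roll out the current network between epochs to track progress.
        results.append(evaluate(q_estimator, policy))
    return results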