def populate_replay_memory(replay_memory,init_size,start_params):
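    """Fill replay_memory with transitions until it holds init_size samples.

    Every third simulation step the current DTSE state, the action implied by the
    traffic light's active phase, the post-step state, and a negated reward signal
    are stored as one sample; the scenario restarts whenever no vehicles remain.
    Relies on module-level helpers (sg, reset_all, q_estimator); start_params is
    currently unused.
    """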
    state = reset_all()
    sg.set_departure_times()
    # state = sg.get_DTSE()
    rewardsContainer = RewardsContainer()
    state = state[0]

    old_past_state, old_new_state, old_action = None, None, None
    prev_pen = 0
    traffic_light_id = sg.get_all_traffic_light_ids()[0]
    action = None
    i = 0
    while len(replay_memory) < init_size:
        # Record one transition every third simulation step
        if i % 3 == 0:
            print(ERASE_LINE+"\r Step {} ({})\t Last Action: {}".format(len(replay_memory), init_size, action), end="")
            sys.stdout.flush()
            # print(state)
            state = sg.get_DTSE()
            prev_rewards = rewardsContainer.get_all_rewards()
            reward = -prev_rewards[q_estimator.reward_index]

            traci.simulationStep()
            sg.set_departure_times()

            # Infer the action taken by SUMO's own signal program from the current
            # phase (two phases per action), capped at the last action index (3)
            action = traci.trafficlight.getPhase(traffic_light_id) // 2
            action = min(action, 3)
            # sg.set_767_action(action)

            next_state = sg.get_DTSE()
            
            old_past_state, old_new_state, old_action = state, next_state, action
            replay_memory.add_sample(old_past_state, old_action, old_new_state, reward)
        else:
            traci.simulationStep()
            sg.set_departure_times()
        i+=1
        if traci.simulation.getMinExpectedNumber() == 0:
            # No vehicles are left in (or scheduled for) the simulation: restart it
            state = reset_all()
            state = state[0]
            rewardsContainer = RewardsContainer()
            i = 0


            
    traci.close()
def validate(q_estimator):
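    """Run q_estimator greedily in a fresh SUMO run for VALIDATION_LENGTH
    simulation steps (acting every third step) and return the rewards averaged
    over the decision steps.
    """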
    try:
        traci.close()
    except Exception:
        # No simulation was running; nothing to close
        pass
    # q_estimator.save_model()
    # q_estimator.load_model()
    rewardsContainer = RewardsContainer()
    traci.start([checkBinary('sumo-gui'), "-c", cfg_file,
                             "--tripinfo-output", "tripinfo.xml",
                             "--step-length", str(time_step),
                             "--time-to-teleport", "99999",
                             "-S",
                             "-Q","-W"])
    sg.set_departure_times()
    step = 0
    total_rewards = []
    while traci.simulation.getMinExpectedNumber() > 0:
        if(step >= VALIDATION_LENGTH): 
            traci.close()
            break
        if((step%3) == 0):
            state = sg.get_DTSE()
        
            best_action = np.argmax(q_estimator.predict(state))
            sg.set_767_action(best_action)
            traci.simulationStep()
            sg.set_departure_times()
            rewards = rewardsContainer.get_all_rewards()
            total_rewards.append(rewards)
        else:
            traci.simulationStep()
            sg.set_departure_times()
        step+=1
    # Average rewards per decision step over the validation window
    return np.sum(np.array(total_rewards), axis=0) / (VALIDATION_LENGTH / 3)
def reset_sim(params):
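    """Restart SUMO with the launch arguments in params, clear its console
    output, and return the initial DTSE state.
    """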
    try:
        traci.close()
    except Exception:
        # There was no open connection to close
        print("No existing SUMO connection to close")
    traci.start(params)
    sg.set_departure_times()

    # ANSI escapes: move the cursor up and clear the previous console output
    # (e.g. SUMO's start-up messages)
    print("\033[9A\033[0J\033[1A")
    
    return sg.get_DTSE()
def evaluate(q_estimator, policy):
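    """Roll out policy for EVALUATION_LENGTH simulation steps (acting whenever the
    simulation time is a multiple of three) and return the mean reward vector
    concatenated with the total vehicle throughput; q_estimator is used only for
    the progress printout.
    """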
    try:
        traci.close()
    except Exception:
        # No simulation was running; nothing to close
        pass
    # q_estimator.save_model()
    # q_estimator = pm.load_model(q_estimator.filename)
        
    traci.start([checkBinary('sumo-gui'), "-c", cfg_file,#-gui
                             "--tripinfo-output", "tripinfo.xml",
                             "--step-length", str(time_step),
                             "--time-to-teleport", "99999",
                             "-S","-W",
                             "-Q"])
    sg.set_departure_times()
    rewards_object = RewardsContainer()
    total_rewards = []
    step = 0
    throughput = 0
    while traci.simulation.getMinExpectedNumber() > 0:
        if(step >= EVALUATION_LENGTH): 
            traci.close()
            print("\033[4A\033[0J\033[1A")
            break
        if(traci.simulation.getTime()%3 == 0):
            
            state = sg.get_DTSE()
            state = torch.FloatTensor(state)
            prediction = q_estimator(state)
            # print(prediction)
            # best_action = np.argmax(prediction.detach().numpy())
            best_action = policy(state)
            sg.set_767_action(best_action)
            traci.simulationStep()
            sg.set_departure_times()
            rewards = rewards_object.get_all_rewards()
            total_rewards.append(rewards)

            print(ERASE_LINE+"\r Step {} ({}) \tBest Action: {}\tPrediction Value: {}\tRewards: {}".format(step,
                                                                                EVALUATION_LENGTH,#current_time,
                                                                                best_action,
                                                                                prediction[0][best_action],
                                                                                rewards[q_estimator.reward_index]),
                                                                                end="")
            sys.stdout.flush()
        else:
            traci.simulationStep()
            sg.set_departure_times()
        throughput += traci.simulation.getArrivedNumber()
        step+=1
    return np.concatenate((np.mean(np.array(total_rewards),axis=0), [throughput]))
def do_train_epoch(q_estimator, replay_memory, optimizer, policy, n_steps, training_epoch, max_epochs):
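    """Run one training episode of n_steps simulation steps: whenever the
    simulation time is a multiple of three, act via policy, store the previous
    transition in replay_memory, and take one optimisation step on q_estimator.
    """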
    reset_all()  # (re)start the simulation; the initial state is read via sg.get_DTSE() below
    state = sg.get_DTSE()
    sg.set_departure_times()
        
    old_past_state, old_new_state, old_action = None,None,None
    reward, loss, action = 0, 0, 0

    g_step = int(q_estimator.get_global_step())
    # epsilon = epsilon_params["values"][min(g_step, epsilon_params["steps"]-1)]
    rewardsContainer = RewardsContainer()
    for t in range(n_steps):
        if(traci.simulation.getTime()%3 == 0):
            
            # If at the step, update the target estimator
            g_step = q_estimator.get_global_step()
            # if( g_step % target_update == 0):
                # pm.polyak_update(from_network=q_estimator,to_network=target_estimator)
                # target_estimator.global_step += 1
            
            # Print the current step
            print(ERASE_LINE+"\r Step {:0>4d} ({}) @ Episode {:0>4d}/{}, loss: {:06.3f}, Prev Reward: {:04.4f}, Last Action: {}".format(t,
                                                                        g_step,#current_time,
                                                                        training_epoch+1,
                                                                        max_epochs,
                                                                        loss,
                                                                        reward,
                                                                        action),
                                                                        end="")
            sys.stdout.flush()
            # Perform one step and return the values
            action, next_state, rewards = do_sumo_step(policy, state,rewardsContainer)
            reward = -rewards[q_estimator.reward_index]
            # reward+=penalty
            
            # Once we have history (t>0) we can add the experience to replay memory
            if t > 0:
                replay_memory.add_sample(old_past_state, old_action, old_new_state, -reward)

            old_past_state, old_new_state, old_action = state, next_state, action
            state = next_state
            q_estimator.global_step += 1

            # Run optimization step
            loss = optimize_model(q_estimator,replay_memory,optimizer)
        else:
            traci.simulationStep()
            sg.set_departure_times()
    return
def do_sumo_step(policy, state, rewardsContainer):
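    """Apply policy's action at the controlled intersection, advance the
    simulation by one step, and return (action, next_state, rewards).
    """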
    # Choose our action and attempt to implement it
    action = policy(torch.FloatTensor(state))
    sg.set_767_action(action)
    # Step
    traci.simulationStep()
    sg.set_departure_times()

    # Get our post-action state
    next_state = sg.get_DTSE()

    # Update our cars locations and waiting times after the action
    rewards = rewardsContainer.get_all_rewards()

    return action, next_state, rewards