def sarsa_nstep_diff_live(W, Nruns):
    print("Running nstep SARSA live")

    for run in range(Nruns):
        print("Run " + str(run + 1))

        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()

        t = 0
        while True:
            intersection = t % 4
            if intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [
                    4 * intersection + 1, 4 * intersection + 2,
                    4 * intersection + 3, 4 * intersection + 4
                ]

            a = epsilon_greedy_a(0, a_space, curr_s, W)

            env_param = sim_environment.take_action(a)
            next_s = env_param['next_state']
            r = env_param['rwd']
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break

            curr_s = next_s
            t += 1
    return
def static_signalling(Nruns):
    print("Running Static signalling")

    for run in range(Nruns):
        print("Run " + str(run + 1))
        sim_environment.start_new_run(run)
        initial_state_generate()
        curr_a = 1  # cyclic test
        t = 0
        counter = [1, 0, 0, 0]
        while True:
            next_intersection = (t + 1) % 4
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [
                    4 * next_intersection + 1, 4 * next_intersection + 2,
                    4 * next_intersection + 3, 4 * next_intersection + 4
                ]
            # cycle through this intersection's phases in a fixed order
            next_a = a_space[counter[next_intersection]]
            counter[next_intersection] = (counter[next_intersection] + 1) % len(a_space)

            env_param = sim_environment.take_action(curr_a)
            r = env_param['rwd']
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break
            curr_a = next_a
            t += 1
    return
def qr_dqn_live(load):

    print("Running QR-DQN Live")

    # delete any existing images
    if platform.system() == 'Windows':
        os.system(r"del .\img\*.png")
    elif platform.system() == 'Linux':
        os.system("rm ./img/*.png")

    if load:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(torch.load(TMPATH))
        model.eval()
    else:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(Z.state_dict())
        model.eval()

    t = 0
    sim_environment.start_new_run(0)
    state = initial_state_generate()
    plt.ion()   # interactive mode so the window does not block the loop
    plt.show()
    while True:
        plt.clf()
        plt.title('step = %s' % t)

        intersection = t % 4
        if intersection == 3:
            a_space = [12, 13, 14]
        else:
            a_space = [4 * intersection, 4 * intersection + 1, 4 * intersection + 2,
                       4 * intersection + 3]

        action = model.select_action(torch.Tensor([state]), a_space, 0)

        observ = sim_environment.take_action(action + 1)
        state = observ['next_state']
        reward = observ['rwd']
        done = 1 if reward == -100 else 0
        t += 1

        Zval = model(torch.Tensor([state])).detach().numpy()
        for i in range(NUM_ACTIONS):
            x, y = get_plot(Zval[0][i])
            plt.plot(x, y, label='%s Q=%.1f' % (i + 1, Zval[0][i].mean()))
        plt.legend(bbox_to_anchor=(1.1, 1.1), ncol=NUM_ACTIONS, prop={'size': 3})

        if done: break

        plt.savefig('./img/%s.png' % t)
        plt.pause(0.001)

    plt.close()

    print("Steps = ", t)

    return
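# A minimal sketch of the get_plot helper assumed by qr_dqn_live above: it turns the
# NUM_QUANTS quantile estimates of one action into (x, y) arrays for a CDF-style plot.
# The plotting convention of the original project is not shown here, so this is only an
# illustrative guess.
def get_plot(quantiles):
    q = np.sort(np.asarray(quantiles).flatten())
    n_q = len(q)
    # quantile midpoints tau_i = (2i + 1) / (2n), matching the tau grid used in training
    y = (2 * np.arange(n_q) + 1) / (2.0 * n_q)
    return q, y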
def qr_dqn_live_noplots(Nruns, load):

    print("Running QR-DQN Live (no plots)")

    if load:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(torch.load(TMPATH))
        model.eval()
    else:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(Z.state_dict())
        model.eval()

    for run in range(Nruns):
        t = 0
        sim_environment.start_new_run(run)
        state = initial_state_generate()
        while True:
            intersection = t % 4
            if intersection == 3:
                a_space = [12, 13, 14]
            else:
                a_space = [4 * intersection, 4 * intersection + 1, 4 * intersection + 2,
                           4 * intersection + 3]

            action = model.select_action(torch.Tensor([state]), a_space, 0)

            observ = sim_environment.take_action(action + 1)
            state = observ['next_state']
            reward = observ['rwd']
            done = 1 if reward == -100 else 0
            t += 1

            if done: break

        print("Steps = ", t)

    return
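# A minimal sketch of the Network class assumed by the QR-DQN functions above: a small
# fully connected quantile network whose forward pass returns one quantile vector per
# action, plus a select_action method that is epsilon-greedy over the allowed a_space.
# The hidden-layer sizes are illustrative assumptions, not the project's actual architecture.
import random
import torch
import torch.nn as nn

class Network(nn.Module):
    def __init__(self, len_state, num_quant, num_actions):
        super().__init__()
        self.num_quant = num_quant
        self.num_actions = num_actions
        self.layers = nn.Sequential(
            nn.Linear(len_state, 128), nn.ReLU(),
            nn.Linear(128, 128), nn.ReLU(),
            nn.Linear(128, num_actions * num_quant),
        )

    def forward(self, x):
        # shape: [batch, num_actions, num_quant]
        return self.layers(x).view(-1, self.num_actions, self.num_quant)

    def select_action(self, state, a_space, epsilon):
        # explore uniformly over the allowed actions with probability epsilon
        if random.random() < epsilon:
            return random.choice(a_space)
        # otherwise act greedily: Q(s, a) is the mean of the quantiles for action a
        with torch.no_grad():
            q = self.forward(state).mean(2)[0]
        return max(a_space, key=lambda a: q[a].item())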
def lqf(Nruns):

    print("Running LQF")

    for run in range(Nruns):
        print("Run " + str(run + 1))
        sim_environment.start_new_run(run)
        initial_state_generate()
        curr_a = random.randint(1, 4)
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            env_param = sim_environment.take_action(curr_a)
            next_s = env_param['next_state']
            r = env_param['rwd']
            if r == -100:
                print("End of simulation at t = " + str(t))
                break

            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [
                    4 * next_intersection + 1, 4 * next_intersection + 2,
                    4 * next_intersection + 3, 4 * next_intersection + 4
                ]

            # Longest Queue First: among the next intersection's phases, pick the one whose
            # queue (state entry) is largest, breaking ties at random
            q_next = []
            for a_temp in a_space:
                q_next.append(next_s[a_temp - 1])
            q_max = max(q_next)
            q_max_index = [i for i, j in enumerate(q_next) if j == q_max]
            rand_greedy_q = np.random.choice(q_max_index)
            next_a = a_space[rand_greedy_q]
            curr_a = next_a
            t += 1
    return
def qr_dqn_train(Nruns):

    print("Running QR-DQN Training")

    # quantile midpoints: tau_i = (2i + 1) / (2 * NUM_QUANTS)
    tau = torch.Tensor((2 * np.arange(NUM_QUANTS) + 1) / (2.0 * NUM_QUANTS)).view(1, -1)

    logger = Logger('q-net', fmt={'loss': '.5f'})

    steps_done = 0
    running_reward = 0
    for run in range(Nruns):
        t = 0
        sum_reward = 0.0
        memory = ReplayMemory(REPLAY_MEM_SIZE)      # Initialize Replay buffer
        sim_environment.start_new_run(run)
        state = initial_state_generate()
        while True:
            intersection = t % 4
            if intersection == 3:
                a_space = [12, 13, 14]
            else:
                a_space = [4 * intersection, 4 * intersection + 1, 4 * intersection + 2,
                           4 * intersection + 3]

            action = Z.select_action(torch.Tensor([state]), a_space, calc_epsilon(steps_done))

            observ = sim_environment.take_action(action + 1)
            next_state = observ['next_state']
            reward = observ['rwd']
            done = 1 if reward == -100 else 0
            steps_done += 1
            t += 1

            if not done:
                memory.push(state, action, next_state, reward, float(done))
                sum_reward += reward

            if len(memory) < BATCH_SIZE:
                state = next_state
                continue

            states, actions, rewards, next_states, dones = memory.sample(BATCH_SIZE)

            # quantile estimates for the actions actually taken
            theta = Z(states)[np.arange(BATCH_SIZE), actions]

            # target-network quantiles for the next states; the greedy next action is chosen
            # within the action space of the next intersection only
            Znext = Ztgt(next_states).detach()
            Qnext_sa = Znext.mean(2)
            anext_max = torch.zeros([BATCH_SIZE], dtype=torch.long)
            for i in range(BATCH_SIZE):
                next_aspace = calc_next_aspace(int(actions[i]))
                temp = Qnext_sa[i, :]
                anext_max[i] = temp[next_aspace].max(0)[1] + next_aspace[0]

            Znext_max = Znext[np.arange(BATCH_SIZE), anext_max]
            Ttheta = rewards + GAMMA * (1 - dones) * Znext_max

            # quantile Huber loss between target and predicted quantiles
            diff = Ttheta.t().unsqueeze(-1) - theta
            loss = huber(diff) * (tau - (diff.detach() < 0).float()).abs()
            loss = loss.mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            state = next_state

            if steps_done % NN_SYNC_FREQ == 0:
                Ztgt.load_state_dict(Z.state_dict())

            if done:
                running_reward = sum_reward if not running_reward else 0.2*sum_reward + 0.8*running_reward
                logger.add(run + 1, steps=t, running_reward=running_reward, loss=loss.data.numpy())
                logger.iter_info()
                break

    torch.save(Z.state_dict(), TMPATH)

    return
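# Minimal sketches of three helpers assumed by qr_dqn_train above. huber is the
# element-wise Huber loss used inside the quantile regression loss, calc_epsilon is an
# annealed exploration rate (the constants here are placeholders, not the original
# settings), and calc_next_aspace maps the action just taken to the action space of the
# next intersection, mirroring the cyclic intersection layout used in the functions above.
import math
import torch

def huber(x, k=1.0):
    # 0.5 * x^2 for |x| <= k, otherwise k * (|x| - 0.5 * k)
    return torch.where(x.abs() <= k, 0.5 * x.pow(2), k * (x.abs() - 0.5 * k))

def calc_epsilon(steps_done, eps_start=1.0, eps_end=0.1, eps_decay=5000):
    # exponentially decaying exploration rate (placeholder schedule)
    return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)

def calc_next_aspace(action):
    # actions 0-3, 4-7, 8-11 belong to intersections 0-2; actions 12-14 to intersection 3;
    # the next decision is made at the following intersection in the cycle
    next_intersection = (action // 4 + 1) % 4
    if next_intersection == 3:
        return [12, 13, 14]
    return [4 * next_intersection + i for i in range(4)]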
def initial_state_generate():
    # advance the simulator a random number of steps with action 0 so each run starts
    # from a different traffic configuration
    for _ in range(np.random.choice([4, 8, 12, 16, 20])):
        env_dict = sim_environment.take_action(0)
    return env_dict['next_state']
def sarsa_nstep_diff_train(n, c, epsilon, Nruns):
    print("Running nstep SARSA training")

    buff_len = n + 1
    weight = np.zeros([S_LEN * A_LEN + 1, 1])
    avg_reward = 0
    dl_counter = 0

    for run in range(Nruns):
        print("Run " + str(run + 1))

        r_arr = np.zeros(n, dtype=int)
        a_arr = np.zeros(buff_len, dtype=int)
        s_arr = [0] * buff_len      # circular buffer of the last n + 1 states
        alpha = 0.1 / (run + 1)
        beta = c * alpha
        e = math.exp(-run)
        #Start new run
        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()
        curr_a = random.randint(1, 4)

        a_arr[0] = curr_a
        s_arr[0] = curr_s
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            #signals corresponding to that intersection
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [
                    4 * next_intersection + 1, 4 * next_intersection + 2,
                    4 * next_intersection + 3, 4 * next_intersection + 4
                ]
            # take the chosen action at the current intersection
            env_param = sim_environment.take_action(curr_a)
            # sim_environment.py returns a reward of 1000 once all traffic has left the simulation
            r = env_param['rwd']
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break

            next_s = env_param['next_state']
            r_arr[t % n] = r
            # choose the next action epsilon-greedily
            next_a = epsilon_greedy_a(e, a_space, next_s, weight[:, 0])
            # store the next state and next action in the circular buffers
            s_arr[(t + 1) % (n + 1)] = next_s
            a_arr[(t + 1) % (n + 1)] = next_a

            tau = t - n + 1
            # n-step differential SARSA update from Richard S. Sutton's Reinforcement Learning book
            if tau >= 0:
                q_tau_n = q_est(s_arr[(tau + n) % (n + 1)],
                                a_arr[(tau + n) % (n + 1)], weight[:, 0])
                q_tau = q_est(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)],
                              weight[:, 0])
                do_error = sum(r_arr) - n * avg_reward + q_tau_n - q_tau
                avg_reward = avg_reward + beta * do_error
                phi_s_a_tau = phi(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)])
                weight[:, 0] = weight[:, 0] + alpha * do_error * np.transpose(
                    phi_s_a_tau)
            curr_a = next_a
            t += 1

    W = weight[:, 0]
    return W
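# Minimal sketches of the linear function-approximation helpers assumed by the n-step
# SARSA functions above. phi places the state vector into the block belonging to the
# (1-indexed) action and appends a bias term, matching the S_LEN * A_LEN + 1 weight
# length; q_est is the resulting linear Q estimate; epsilon_greedy_a picks an action
# from a_space epsilon-greedily with random tie-breaking. The exact feature encoding
# used by the original project is an assumption.
import random
import numpy as np

def phi(s, a):
    features = np.zeros(S_LEN * A_LEN + 1)
    features[(a - 1) * S_LEN:a * S_LEN] = np.asarray(s)[:S_LEN]
    features[-1] = 1.0      # bias term
    return features

def q_est(s, a, w):
    # linear action-value estimate q(s, a) = w . phi(s, a)
    return float(np.dot(w, phi(s, a)))

def epsilon_greedy_a(e, a_space, s, w):
    # explore uniformly with probability e, otherwise act greedily w.r.t. q_est
    if random.random() < e:
        return random.choice(a_space)
    q_vals = [q_est(s, a, w) for a in a_space]
    q_max = max(q_vals)
    best = [a for a, q in zip(a_space, q_vals) if q == q_max]
    return random.choice(best)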
def sarsa_nstep_diff_train(n, c, epsilon, Nruns):

    print("Running nstep SARSA training")

    buff_len = n + 1
    r_arr = np.zeros(n, dtype=int)
    a_arr = np.zeros(buff_len, dtype=int)
    s_arr = [0] * buff_len      # circular buffer of the last n + 1 states
    weight = np.zeros([S_LEN * A_LEN + 1, 1])

    for run in range(Nruns):
        print("Run " + str(run + 1))
        avg_reward = 10  # initialize avg reward
        e = epsilon - run * (epsilon / Nruns)
        if e < 0.4:
            e = 0.4
        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()
        curr_a = random.randint(1, 4)
        a_arr[0] = curr_a
        s_arr[0] = curr_s
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [
                    4 * next_intersection + 1, 4 * next_intersection + 2,
                    4 * next_intersection + 3, 4 * next_intersection + 4
                ]

            alpha = 1 / (math.ceil((t + 1) / 10))
            beta = c * alpha

            env_param = sim_environment.take_action(curr_a)
            r = env_param['rwd']
            if r == -100:
                print("Simulation time", t)  # for test
                break

            next_s = env_param['next_state']
            r_arr[t % n] = r
            next_a = epsilon_greedy_a(e, a_space, next_s, weight[:, 0])
            s_arr[(t + 1) % (n + 1)] = next_s
            a_arr[(t + 1) % (n + 1)] = next_a

            tau = t - n + 1
            if tau >= 0:
                q_tau_n = q_est(s_arr[(tau + n) % (n + 1)],
                                a_arr[(tau + n) % (n + 1)], weight[:, 0])
                q_tau = q_est(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)],
                              weight[:, 0])
                do_error = sum(r_arr) - n * avg_reward + q_tau_n - q_tau
                avg_reward = avg_reward + beta * do_error
                phi_s_a_tau = phi(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)])
                weight[:, 0] = weight[:, 0] + alpha * do_error * np.transpose(
                    phi_s_a_tau)
            curr_a = next_a
            t += 1
    W = weight[:, 0]
    return W
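# Illustrative usage of the functions above; the parameter values are placeholders, not
# the settings used in the original experiments.
if __name__ == "__main__":
    W = sarsa_nstep_diff_train(n=4, c=0.01, epsilon=1.0, Nruns=20)
    sarsa_nstep_diff_live(W, Nruns=1)
    static_signalling(Nruns=1)
    lqf(Nruns=1)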