def get_task(task_id):
    if task_id == 1:
        test_case_id = 'task1_test'
        return {
            'time_limit': 600,
            'testcases': [{
                'id': test_case_id,
                'env': construct_task1_env(),
                'runs': 1,
                't_max': 50
            }]
        }
    elif task_id == 2:
        tcs = [('t2_tmax50', 50), ('t2_tmax40', 40)]
        return {
            'time_limit': 600,
            'testcases': [{
                'id': tc,
                'env': construct_task2_env(),
                'runs': 300,
                't_max': t_max
            } for tc, t_max in tcs]
        }
    else:
        raise NotImplementedError
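
For context, the dictionary returned by get_task is meant to be consumed by a small runner that iterates over the test cases. The sketch below is an assumption about that harness, not part of the original source; it presumes a test(agent, env, runs, t_max) helper like the one in Example #2 and an already-constructed agent object.

def run_task(agent, task):
    # Hypothetical runner: evaluate the agent on every test case and warn
    # if the wall-clock budget declared in the task spec is exceeded.
    import time
    start_time = time.time()
    for tc in task['testcases']:
        avg_reward = test(agent, tc['env'], runs=tc['runs'], t_max=tc['t_max'])
        print('{}: avg reward {:.3f}'.format(tc['id'], avg_reward))
    elapsed = time.time() - start_time
    if elapsed > task['time_limit']:
        print('WARNING: exceeded time limit of {} seconds'.format(task['time_limit']))

# e.g. run_task(agent, get_task(task_id=2))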
Example #2
    def test(agent, env, runs=1000, t_max=100):
        rewards = []
        fail = []
        for run in range(runs):
            env = construct_task2_env(random_seed=run)
            state = env.reset()
            agent_init = {'fast_downward_path': FAST_DOWNWARD_PATH, 'agent_speed_range': (-3,-1), 'gamma' : 1}
            agent.initialize(**agent_init)
            episode_rewards = 0.0
            for t in range(t_max):
                action = agent.step(state)
                next_state, reward, done, info = env.step(action)
                full_state = {
                    'state': state, 'action': action, 'reward': reward, 'next_state': next_state, 
                    'done': done, 'info': info
                }
                agent.update(**full_state)
                agent_lane, agent_x = get_agent_pos(next_state)  # agent position (not used below)
                state = next_state
                episode_rewards += reward
                if done:
                    break
            if episode_rewards == 0:
                fail.append(run)
            rewards.append(episode_rewards)
            print(run, episode_rewards, t)
        avg_rewards = sum(rewards)/len(rewards)

        print("{} run(s) avg rewards : {:.3f}".format(runs, avg_rewards))
        print("Fail: " + str(fail))
        return avg_rewards
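
A minimal way to drive this harness, shown as a sketch rather than code from the original source; Agent and construct_task2_env are assumed to be provided by the surrounding assignment code.

agent = Agent()  # hypothetical agent exposing initialize/step/update as used above
env = construct_task2_env()
avg = test(agent, env, runs=300, t_max=50)
print('overall average reward:', avg)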
Example #3
    def test(agent, env, runs=1000, t_max=100):
        rewards = []
        for run in range(runs):
            env = construct_task2_env(run)
            state = env.reset()
            agent_init = {'agent_speed_range': (-3, -1), 'gamma': 1}
            agent.initialize(**agent_init)
            episode_rewards = 0.0
            for t in range(t_max):
                action = agent.step(state)
                next_state, reward, done, info = env.step(action)
                full_state = {
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'next_state': next_state,
                    'done': done,
                    'info': info
                }
                agent.update(**full_state)
                state = next_state
                episode_rewards += reward
                if done:
                    break
            rewards.append(episode_rewards)
        avg_rewards = sum(rewards) / len(rewards)

        return avg_rewards
Example #4
def get_task():
    tcs = [('task_2_tmax50', 50), ('task_2_tmax40', 40)]
    return {
        'time_limit': 600,
        'testcases': [{
            'id': tc,
            'env': construct_task2_env(),
            'runs': 300,
            't_max': t_max
        } for tc, t_max in tcs]
    }
Example #5
def get_cars(state):
    # First 9 entries of observation channel 0 (assumed here to be the car layer).
    return torch.Tensor(state[0][0:9])


def get_trails(state):
    # First 9 entries of observation channel 3 (assumed here to be the trail layer).
    return torch.Tensor(state[3][0:9])


if __name__ == '__main__':
    print('Initializing device and model...')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = RTrailNetwork().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print('Initializing environment...')
    env = construct_task2_env()
    env.reset()
    history = []

    print('Training...')

    input_cartrails = []
    target_trails = []

    iterations = 0
    while True:
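        # Roll the environment forward with a fixed action and, while the episode
        # is still running, record (get_cars, get_trails) tensor pairs in `history`
        # as training data.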
        next_state, reward, done, info = env.step(4)
        if not done:
            history.append((get_cars(next_state).to(device),
                            get_trails(next_state).to(device)))
            continue
Example #6
        elapsed_time = time.time() - start_time

        print('Point:', point)

        for t, remarks in [(0.4, 'fast'), (0.6, 'safe'), (0.8, 'dangerous'), (1.0, 'time limit exceeded')]:
            if elapsed_time < task['time_limit'] * t:
                print("Local runtime: {} seconds --- {}".format(elapsed_time, remarks))
                print("WARNING: do note that this might not reflect the runtime on the server.")
                break

    def get_task():
        tcs = [('task_2_tmax50', 50), ('task_2_tmax40', 40)]
        return {
            'time_limit': 600,
            'testcases': [{ 'id': tc, 'env': construct_task2_env(), 'runs': 300, 't_max': t_max } for tc, t_max in tcs]
        }

    task = get_task()

    import argparse

    parser = argparse.ArgumentParser(description='Train and test DQN agent.')
    parser.add_argument('--train', dest='train', action='store_true', help='train the agent')
    args = parser.parse_args()

    if args.train:
        model = train(ConvDQN, construct_task2_env())
        save_model(model)
    else:
        timed_test(task)
def train(model_class, env):
    '''
    Train a model of instance `model_class` on environment `env` (`GridDrivingEnv`).

    It runs the model for `max_episodes` times to collect experiences (`Transition`)
    and store it in the `ReplayBuffer`. It collects an experience by selecting an action
    using the `model.act` function and apply it to the environment, through `env.step`.
    After every episode, it will train the model for `train_steps` times using the
    `optimize` function.

    Output: `model`: the trained model.
    '''
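    # Sketch of the loop described above (note that the body below replaces
    # model.act with a hand-written rule-based policy):
    #   for each episode:
    #       state = env.reset()
    #       for t in range(t_max):
    #           action = model.act(state, epsilon)
    #           next_state, reward, done, info = env.step(action)
    #           memory.push(Transition(state, action, reward, next_state, done))
    #           state = next_state
    #       for _ in range(train_steps):
    #           optimize(model, target, memory, optimizer)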

    # Initialize model and target network
    model = model_class(env.world.tensor_space().shape,
                        env.action_space.n).to(device)
    target = model_class(env.world.tensor_space().shape,
                         env.action_space.n).to(device)
    target.load_state_dict(model.state_dict())
    target.eval()

    # Initialize replay buffer
    memory = ReplayBuffer()

    print(model)

    # Initialize rewards, losses, and optimizer
    rewards = []
    losses = []
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    numiters = 15
    explorationParam = 1.
    random_seed = 10
    # mcts = MonteCarloTreeSearch(env=env, numiters=numiters, explorationParam=1., random_seed=random_seed)

    for episode in range(max_episodes):
        epsilon = compute_epsilon(episode)
        state = env.reset()
        episode_rewards = 0.0

        for t in range(t_max):
            # Model takes action
            # state = GridWorldState(state, is_done=env.done)
            state_tensor = np.copy(env.world.tensor_state)

            # print('state:', state)
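            # Rule-based action selection: instead of the DQN policy (model.act,
            # commented out further below), inspect the cars in the agent's lane and
            # in lane - 1, then pick an index into env.actions from simple
            # collision-avoidance rules.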
            liststate = []
            relevantstate = []
            for car in state[0]:
                liststate.append(car)
            # self.env.render()
            for cars in liststate:
                if cars.lane == state.agent.lane - 1:
                    relevantstate.append(cars)
            positionx = []
            relevantspeed = 0  # default so the checks below never hit an undefined name
            if len(relevantstate) > 0:
                relevantspeed = relevantstate[0].speed_range[0] * -1

            for relevantcar in relevantstate:
                positionx.append(relevantcar.position.x)
                # positionx[1].append(relevantcar.speed_range[0])
            agentpos = state.agent.position.x

            relevantpos = []

            for pos in positionx:
                if ((pos - agentpos >= 0) &
                    (pos - agentpos <= relevantspeed)) | (
                        ((49 + pos) - agentpos >= 0) &
                        ((49 + pos) - agentpos <= relevantspeed)):
                    relevantpos.append(pos)
                # if pos - agentpos < 0 & agentpos - pos <= relevantspeed:

            backtrack = []
            relevantsamelane = []

            for cars in liststate:
                if cars.lane == state.agent.lane:
                    relevantsamelane.append(cars)
            samelaneposx = []

            for relevantcarsamelane in relevantsamelane:
                samelaneposx.append(relevantcarsamelane.position.x)
                # positionx[1].append(relevantcar.speed_range[0])
            # agentpos = state.agent.position.x

            relevantsamelanepos = []
            # Guard against an empty same-lane list to avoid an IndexError.
            relevantsamelanespeed = relevantsamelane[0].speed_range[0] * -1 if relevantsamelane else 0

            onestephazard = 0
            frontline = []
            for pos2 in samelaneposx:
                if ((pos2 - agentpos >= 0) &
                    (pos2 - agentpos <= relevantsamelanespeed)) | (
                        ((49 + pos2) - agentpos >= 0) &
                        ((49 + pos2) - agentpos <= relevantsamelanespeed)):
                    relevantsamelanepos.append(pos2)
                if ((pos2 - agentpos) < 0) & ((agentpos - pos2) <= 2):
                    onestephazard = 1

            actionnum = 4  # forward-1
            if (state.agent.position.y == 0) & (onestephazard == 0):
                actionnum = 3
                # print("cond1")
            elif (len(relevantpos) == 0) & (state.agent.position.y != 0):
                actionnum = 0  # up
                # print("cond2")
            elif (len(relevantsamelanepos) >
                  0) & (onestephazard != 1) & (state.agent.position.y != 0):
                actionnum = 3
                # print("cond3")
                if (relevantsamelanespeed > 2):
                    actionnum = 2
            elif (len(relevantsamelanepos) > 0) & (onestephazard != 1):
                actionnum = 3
                # print("cond4")
            elif (relevantspeed == 1):
                actionnum = 3
                # print("cond5")

            if state.agent.position.x == 1:
                actionnum = 4
            # elif len(relevantpos) != 0:
            #     action = 3

            # action = mcts.buildTreeAndReturnBestAction(initialState=state)
            # print(actionnum)

            action = env.actions[actionnum]
            # done = env.step(state=deepcopy(state.state), action=action)[2]
            # action = torch.from_numpy(action).float().unsqueeze(0).to(device)

            # action = model.act(state, epsilon)

            # print(action)
            # env.render()
            # Apply the action to the environment
            next_state, reward, done, info = env.step(state=deepcopy(state),
                                                      action=action)
            # env.render()
            # Save transition to replay buffer
            next_state_tensor = np.copy(env.world.tensor_state)
            memory.push(
                Transition(state_tensor, [env.actions.index(action)], [reward],
                           next_state_tensor, [done]))

            state = next_state
            episode_rewards += reward
            if done:
                # print("episode done"+ str(episode_rewards))
                break
        rewards.append(episode_rewards)

        # Train the model if memory is sufficient
        if len(memory) > min_buffer:
            if np.mean(rewards[print_interval:]) < 0.0001:
                print('Bad initialization. Please restart the training.')
                exit()
            for i in range(train_steps):
                loss = optimize(model, target, memory, optimizer)
                losses.append(loss.item())
        # increment()

        # Update target network every once in a while
        if episode % target_update == 0:
            target.load_state_dict(model.state_dict())

        if episode % print_interval == 0 and episode > 0:
            print(
                "[Episode {}]\tavg rewards : {:.3f},\tavg loss : {:.6f},\tbuffer size : {},\tepsilon : {:.1f}%"
                .format(episode, np.mean(rewards[print_interval:]),
                        np.mean(losses[print_interval * 10:]), len(memory),
                        epsilon * 100))
            if episode % 200 == 0:
                save_model(model)
                model = get_model()
                test(model, construct_task2_env(), max_episodes=10)
    return model


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train and test DQN agent.')
    parser.add_argument('--train',
                        dest='train',
                        action='store_true',
                        help='train the agent')
    args = parser.parse_args()

    def get_task():
        tcs = [('t2_tmax50', 50), ('t2_tmax40', 40)]
        return {
            'time_limit': 600,
            'testcases': [{
                'id': tc,
                'env': construct_task2_env(),
                'runs': 300,
                't_max': t_max
            } for tc, t_max in tcs]
        }

    task = get_task()
    # timed_test(task)
    # env = get_env()

    if args.train:
        model = train(ConvDQN, construct_task2_env())
        save_model(model)
    else:
        model = get_model()
    test(model, construct_task2_env(), max_episodes=10)