def get_task(task_id):
    if task_id == 1:
        test_case_id = 'task1_test'
        return {
            'time_limit': 600,
            'testcases': [{
                'id': test_case_id,
                'env': construct_task1_env(),
                'runs': 1,
                't_max': 50
            }]
        }
    elif task_id == 2:
        tcs = [('t2_tmax50', 50), ('t2_tmax40', 40)]
        return {
            'time_limit': 600,
            'testcases': [{
                'id': tc,
                'env': construct_task2_env(),
                'runs': 300,
                't_max': t_max
            } for tc, t_max in tcs]
        }
    else:
        raise NotImplementedError
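# Usage sketch: one way a harness might consume the dict returned by get_task.
# This helper is illustrative only -- `agent` is assumed to be any object
# accepted by the test() functions below; it is not part of the original code.
def run_task_example(agent, task_id=2):
    task = get_task(task_id)
    for tc in task['testcases']:
        # Each testcase bundles an env with its run budget and horizon.
        avg = test(agent, tc['env'], runs=tc['runs'], t_max=tc['t_max'])
        print('{}: avg reward {:.3f}'.format(tc['id'], avg))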
def test(agent, env, runs=1000, t_max=100):
    rewards = []
    fail = []
    for run in range(runs):
        # The `env` argument is deliberately replaced each run so that every
        # episode starts from a freshly constructed, per-run-seeded world.
        env = construct_task2_env(random_seed=run)
        state = env.reset()
        agent_init = {'fast_downward_path': FAST_DOWNWARD_PATH,
                      'agent_speed_range': (-3, -1),
                      'gamma': 1}
        agent.initialize(**agent_init)
        episode_rewards = 0.0
        for t in range(t_max):
            action = agent.step(state)
            next_state, reward, done, info = env.step(action)
            full_state = {
                'state': state, 'action': action, 'reward': reward,
                'next_state': next_state, 'done': done, 'info': info
            }
            agent.update(**full_state)
            agent_lane, agent_x = get_agent_pos(next_state)  # currently unused
            state = next_state
            episode_rewards += reward
            if done:
                break
        # A zero-reward episode means the agent never reached the goal.
        if episode_rewards == 0:
            fail.append(run)
        rewards.append(episode_rewards)
        print(run, episode_rewards, t)
    avg_rewards = sum(rewards) / len(rewards)
    print("{} run(s) avg rewards : {:.3f}".format(runs, avg_rewards))
    print("Fail: " + str(fail))
    return avg_rewards
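# get_agent_pos is called above but not defined in this snippet. A minimal
# sketch, assuming the observation is a channels-first grid in which one
# channel holds a single 1 at the agent's cell; the channel index (1 here)
# is an assumption, not confirmed by the original code:
import numpy as np

def get_agent_pos(state, agent_channel=1):
    # argwhere yields the (lane, x) coordinates of every nonzero cell;
    # with a single agent there is exactly one such cell.
    lane, x = np.argwhere(np.asarray(state)[agent_channel])[0]
    return lane, x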
def test(agent, env, runs=1000, t_max=100):
    # Slimmer variant of the test() above: no FAST_DOWNWARD_PATH in the agent
    # config, no failure tracking, and no per-run printing.
    rewards = []
    for run in range(runs):
        env = construct_task2_env(run)
        state = env.reset()
        agent_init = {'agent_speed_range': (-3, -1), 'gamma': 1}
        agent.initialize(**agent_init)
        episode_rewards = 0.0
        for t in range(t_max):
            action = agent.step(state)
            next_state, reward, done, info = env.step(action)
            full_state = {
                'state': state, 'action': action, 'reward': reward,
                'next_state': next_state, 'done': done, 'info': info
            }
            agent.update(**full_state)
            state = next_state
            episode_rewards += reward
            if done:
                break
        rewards.append(episode_rewards)
    avg_rewards = sum(rewards) / len(rewards)
    return avg_rewards
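# Both test() variants imply the same agent interface: initialize(**kwargs),
# step(state) -> action, and update(**transition). A minimal sketch of an
# agent satisfying that contract (a random policy, purely illustrative; the
# hard-coded action count of 5 is an assumption about the env):
import random

class RandomAgent:
    def initialize(self, **kwargs):
        # test() passes keys such as 'agent_speed_range' and 'gamma'.
        self.config = kwargs

    def step(self, state):
        return random.randrange(5)

    def update(self, **transition):
        # A learning agent would consume (state, action, reward, ...) here.
        pass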
import torch
import torch.optim as optim

def get_cars(state):
    # Channel 0 of the observation: car occupancy for lanes 0-8.
    return torch.Tensor(state[0][0:9])

def get_trails(state):
    # Channel 3 of the observation: the corresponding car trails.
    return torch.Tensor(state[3][0:9])

if __name__ == '__main__':
    print('Initializing device and model...')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = RTrailNetwork().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print('Initializing environment...')
    env = construct_task2_env()
    env.reset()
    history = []

    print('Training...')
    input_cartrails = []
    target_trails = []
    iterations = 0
    while True:
        # Step with a fixed action and collect (cars, trails) pairs as
        # supervised examples until the episode ends.
        next_state, reward, done, info = env.step(4)
        if not done:
            history.append((get_cars(next_state).to(device),
                            get_trails(next_state).to(device)))
            continue
        # The original snippet is truncated here: the branch that runs on
        # episode end (resetting the env and training on `history`) is missing.
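# A hedged sketch of the missing episode-end branch: one plausible way to turn
# the collected (cars, trails) pairs into a supervised update. RTrailNetwork's
# exact input/output shapes are not shown in this snippet, so the stacking and
# MSE objective below are assumptions, not the original training step.
import torch.nn.functional as F

def train_on_history(model, optimizer, history):
    cars = torch.stack([c for c, _ in history])    # inputs: car channel
    trails = torch.stack([t for _, t in history])  # targets: trail channel
    optimizer.zero_grad()
    loss = F.mse_loss(model(cars), trails)         # predict trails from cars
    loss.backward()
    optimizer.step()
    return loss.item()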
# Tail of timed_test(task): grade the measured wall-clock time against
# fractions of the task's time limit (the start of the function is not
# included in this snippet).
elapsed_time = time.time() - start_time
print('Point:', point)
for t, remarks in [(0.4, 'fast'), (0.6, 'safe'),
                   (0.8, 'dangerous'), (1.0, 'time limit exceeded')]:
    if elapsed_time < task['time_limit'] * t:
        print("Local runtime: {} seconds --- {}".format(elapsed_time, remarks))
        print("WARNING: do note that this might not reflect the runtime on the server.")
        break

def get_task():
    tcs = [('task_2_tmax50', 50), ('task_2_tmax40', 40)]
    return {
        'time_limit': 600,
        'testcases': [{
            'id': tc,
            'env': construct_task2_env(),
            'runs': 300,
            't_max': t_max
        } for tc, t_max in tcs]
    }

task = get_task()

import argparse

parser = argparse.ArgumentParser(description='Train and test DQN agent.')
parser.add_argument('--train', dest='train', action='store_true', help='train the agent')
args = parser.parse_args()

if args.train:
    model = train(ConvDQN, construct_task2_env())
    save_model(model)
else:
    timed_test(task)
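# timed_test is invoked above but only its timing tail appears in this
# snippet. A minimal sketch of a complete version, assuming it reuses test()
# and the testcase fields produced by get_task(); accumulating the average
# reward as the score is an assumption, not the original grading rule:
import time

def timed_test(task):
    start_time = time.time()
    point = 0.0
    for tc in task['testcases']:
        avg = test(get_model(), tc['env'], runs=tc['runs'], t_max=tc['t_max'])
        point += avg  # assumed scoring: sum of per-testcase average rewards
    elapsed_time = time.time() - start_time
    print('Point:', point)
    for t, remarks in [(0.4, 'fast'), (0.6, 'safe'),
                       (0.8, 'dangerous'), (1.0, 'time limit exceeded')]:
        if elapsed_time < task['time_limit'] * t:
            print("Local runtime: {} seconds --- {}".format(elapsed_time, remarks))
            print("WARNING: do note that this might not reflect the runtime on the server.")
            break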
from copy import deepcopy
import numpy as np

def train(model_class, env):
    '''
    Train a model of instance `model_class` on environment `env` (`GridDrivingEnv`).

    It runs the model for `max_episodes` episodes to collect experiences
    (`Transition`) and stores them in the `ReplayBuffer`. It collects an
    experience by selecting an action using the `model.act` function and
    applying it to the environment through `env.step`. After every episode,
    it trains the model for `train_steps` steps using the `optimize` function.

    Output: `model`: the trained model.
    '''
    # Initialize model and target network
    model = model_class(env.world.tensor_space().shape, env.action_space.n).to(device)
    target = model_class(env.world.tensor_space().shape, env.action_space.n).to(device)
    target.load_state_dict(model.state_dict())
    target.eval()

    # Initialize replay buffer
    memory = ReplayBuffer()

    print(model)

    # Initialize rewards, losses, and optimizer
    rewards = []
    losses = []
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    numiters = 15
    explorationParam = 1.
    random_seed = 10
    # mcts = MonteCarloTreeSearch(env=env, numiters=numiters, explorationParam=1., random_seed=random_seed)

    for episode in range(max_episodes):
        epsilon = compute_epsilon(episode)
        state = env.reset()
        episode_rewards = 0.0
        for t in range(t_max):
            # Snapshot the tensor view of the world before acting.
            # state = GridWorldState(state, is_done=env.done)
            state_tensor = np.copy(env.world.tensor_state)

            liststate = [car for car in state[0]]
            # Cars in the lane directly above the agent.
            relevantstate = [car for car in liststate if car.lane == state.agent.lane - 1]

            positionx = []
            relevantspeed = 0  # default when the lane above is empty (avoids a NameError below)
            if len(relevantstate) > 0:
                # Speed ranges are negative (cars move leftwards); flip the sign.
                relevantspeed = relevantstate[0].speed_range[0] * -1
                for relevantcar in relevantstate:
                    positionx.append(relevantcar.position.x)

            agentpos = state.agent.position.x
            # Cars in the lane above that could reach the agent's column within
            # one step; the +49 terms handle wrap-around of the 50-cell track.
            relevantpos = []
            for pos in positionx:
                if (pos - agentpos >= 0 and pos - agentpos <= relevantspeed) or \
                        ((49 + pos) - agentpos >= 0 and (49 + pos) - agentpos <= relevantspeed):
                    relevantpos.append(pos)

            # Cars in the agent's own lane.
            relevantsamelane = [car for car in liststate if car.lane == state.agent.lane]
            samelaneposx = [car.position.x for car in relevantsamelane]
            relevantsamelanepos = []
            relevantsamelanespeed = relevantsamelane[0].speed_range[0] * -1
            onestephazard = 0
            for pos2 in samelaneposx:
                if (pos2 - agentpos >= 0 and pos2 - agentpos <= relevantsamelanespeed) or \
                        ((49 + pos2) - agentpos >= 0 and (49 + pos2) - agentpos <= relevantsamelanespeed):
                    relevantsamelanepos.append(pos2)
                # A car at most two cells behind the agent is a one-step hazard.
                if pos2 - agentpos < 0 and agentpos - pos2 <= 2:
                    onestephazard = 1

            # Heuristic policy over action indices (the original labels
            # index 4 as "forward-1" and index 0 as "up").
            actionnum = 4  # forward-1
            if state.agent.position.y == 0 and onestephazard == 0:
                actionnum = 3
            elif len(relevantpos) == 0 and state.agent.position.y != 0:
                actionnum = 0  # up
            elif len(relevantsamelanepos) > 0 and onestephazard != 1 and state.agent.position.y != 0:
                actionnum = 3
                if relevantsamelanespeed > 2:
                    actionnum = 2
            elif len(relevantsamelanepos) > 0 and onestephazard != 1:
                actionnum = 3
            elif relevantspeed == 1:
                actionnum = 3
            if state.agent.position.x == 1:
                actionnum = 4
            # action = mcts.buildTreeAndReturnBestAction(initialState=state)
            action = env.actions[actionnum]
            # action = model.act(state, epsilon)

            # Apply the action to the environment
            next_state, reward, done, info = env.step(state=deepcopy(state), action=action)

            # Save transition to replay buffer
            next_state_tensor = np.copy(env.world.tensor_state)
            memory.push(Transition(state_tensor, [env.actions.index(action)],
                                   [reward], next_state_tensor, [done]))

            state = next_state
            episode_rewards += reward
            if done:
                break
        rewards.append(episode_rewards)

        # Train the model if memory is sufficient
        if len(memory) > min_buffer:
            if np.mean(rewards[print_interval:]) < 0.0001:
                print('Bad initialization. Please restart the training.')
                exit()
            for i in range(train_steps):
                loss = optimize(model, target, memory, optimizer)
                losses.append(loss.item())

        # Update target network every once in a while
        if episode % target_update == 0:
            target.load_state_dict(model.state_dict())

        if episode % print_interval == 0 and episode > 0:
            print("[Episode {}]\tavg rewards : {:.3f},\tavg loss : {:.6f},\tbuffer size : {},\tepsilon : {:.1f}%"
                  .format(episode, np.mean(rewards[print_interval:]),
                          np.mean(losses[print_interval * 10:]), len(memory), epsilon * 100))

        # Periodically checkpoint, reload, and smoke-test the saved model.
        if episode % 200 == 0:
            save_model(model)
            model = get_model()
            test(model, construct_task2_env(), runs=10)
    return model
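# compute_epsilon and optimize are called by train() but are not defined in
# this snippet. A minimal sketch of both, assuming standard DQN conventions:
# epsilon annealed exponentially over episodes, and a one-step TD update
# against the frozen target network. The hyperparameter names and the
# assumption that ReplayBuffer.sample returns batched tensors (with long-typed
# action indices) are mine, not the original file's.
import math
import torch
import torch.nn.functional as F

def compute_epsilon(episode, max_epsilon=1.0, min_epsilon=0.01, epsilon_decay=500):
    # Exponential anneal from max_epsilon toward min_epsilon.
    return min_epsilon + (max_epsilon - min_epsilon) * math.exp(-episode / epsilon_decay)

def optimize(model, target, memory, optimizer, batch_size=32, gamma=0.98):
    states, actions, rewards, next_states, dones = memory.sample(batch_size)
    q = model(states).gather(1, actions)  # Q(s, a) for the actions taken
    with torch.no_grad():
        next_q = target(next_states).max(1, keepdim=True)[0]  # max_a' Q_target(s', a')
        td_target = rewards + gamma * next_q * (1 - dones)    # zero bootstrap at terminals
    loss = F.smooth_l1_loss(q, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss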
parser = argparse.ArgumentParser(description='Train and test DQN agent.')
parser.add_argument('--train', dest='train', action='store_true', help='train the agent')
args = parser.parse_args()

def get_task():
    tcs = [('t2_tmax50', 50), ('t2_tmax40', 40)]
    return {
        'time_limit': 600,
        'testcases': [{
            'id': tc,
            'env': construct_task2_env(),
            'runs': 300,
            't_max': t_max
        } for tc, t_max in tcs]
    }

task = get_task()
# timed_test(task)
# env = get_env()

if args.train:
    model = train(ConvDQN, construct_task2_env())
    save_model(model)
else:
    model = get_model()
    test(model, construct_task2_env(), runs=10)
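# Usage (the filename agent.py is an assumption; the original does not name
# the file):
#   python agent.py --train   # train a ConvDQN from scratch and save it
#   python agent.py           # load the saved model and run a short test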