def __init__(self, path, size=(500, 100), seed=1234, traces=True):
    """Build a dataset of (state, next_state, label) tuples for the StoppingCar environment.

    Labels: 1 for one-step transitions from sampled starting states, -1 for transitions on
    grid-search traces that reach an unsafe state (delta_x below 0.5), 0 for transitions on
    traces that stay safe.
    """
    self.size = size
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    config = {"cost_fn": 1, "simplified": True}
    self.env = StoppingCar(config)
    self.env.seed(seed)
    load_dataset = True
    file_name = "dataset_new.p"
    if load_dataset and traces and os.path.exists(file_name):
        dataset = pickle.load(open(file_name, "rb"))
    else:
        dataset = []
        while len(dataset) < size[0]:
            state_np = self.env.reset()  # only starting states
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            action = torch.argmax(sequential_nn(state_reduced)).item()
            next_state_np, reward, done, _ = self.env.step(action)
            dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 1))
        param_grid = {'delta_v': np.arange(-30, 30, 0.5), 'delta_x': np.arange(-10, 40, 0.5)}
        for parameters in ParameterGrid(param_grid):
            delta_v = parameters["delta_v"]
            delta_x = parameters["delta_x"]
            self.env.reset()
            self.env.x_lead = delta_x
            self.env.x_ego = 0
            self.env.v_lead = delta_v
            self.env.v_ego = 0
            done = False
            temp_dataset = []
            state_np = np.array([delta_v, delta_x])
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            for i in (range(100) if traces else range(1)):
                # action = torch.argmax(sequential_nn(state_reduced)).item()
                action = self.env.perfect_action()
                next_state_np, reward, done, _ = self.env.step(action)
                temp_dataset.append((state_np, next_state_np))
                state_np = next_state_np
                if next_state_np[1] < 0.5 and not done:
                    done = True
                if done is True:  # only unsafe states
                    break
            if done:  # trace reached an unsafe state: label -1
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), -1))
            else:  # trace stayed safe: label 0
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 0))
        if traces:
            pickle.dump(dataset, open(file_name, "wb+"))
    self.dataset = dataset
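# Usage sketch (illustrative, not part of the original code): assuming this __init__ belongs to the
# GridSearchDataset class used elsewhere in this repo, the helper below counts entries per label to
# show how the (state, next_state, label) tuples can be consumed. The function name
# `_summarise_dataset` and the checkpoint path are hypothetical placeholders.
def _summarise_dataset(checkpoint_path="<path-to-ppo-checkpoint>"):
    data = GridSearchDataset(checkpoint_path)
    counts = {1: 0, 0: 0, -1: 0}
    for _, _, label in data.dataset:
        counts[label] += 1
    print(f"starting: {counts[1]}, safe: {counts[0]}, unsafe: {counts[-1]}")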
def sample_trajectory(sample, nn, t_max, n_trajectories):
    n_failures = 0
    for n_trajectory in range(n_trajectories):
        state = sample
        for t in range(t_max):
            action_score = torch.softmax(nn(torch.tensor(state, dtype=torch.float32)), 0)
            n_actions = len(action_score)
            action = np.random.choice(n_actions, p=action_score.detach().numpy())
            # action = np.random.choice(n_actions)  # , p=action_score.detach().numpy())
            # action = 1
            successor, cost, done, _ = StoppingCar.compute_successor(state, action)
            if done:
                n_failures += 1
                break
            state = successor
    return n_failures
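# Usage sketch (illustrative, not part of the original code): a Monte Carlo estimate of the failure
# probability from a single start state. The (delta_v, delta_x) start values and trajectory counts
# below are placeholders, and a 2-input policy network such as the `sequential_nn` built elsewhere
# in this repo is assumed to be in scope.
start_state = np.array([5.0, 20.0], dtype=np.float32)  # hypothetical (delta_v, delta_x) start
n_runs = 100
failures = sample_trajectory(start_state, sequential_nn, t_max=100, n_trajectories=n_runs)
print(f"empirical failure probability: {failures / n_runs:.2%}")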
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_15-49-16c3ga4n0f/checkpoint_12/checkpoint-12") # super safe # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_16-46-205jtg2ce8/checkpoint_8/checkpoint-8") # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_16-48-24_ovepj_6/checkpoint_27/checkpoint-27") # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_14-48-03xgehld_5/checkpoint_100/checkpoint-100") #target 0 distance # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_14-52-57j5qb7ovs/checkpoint_200/checkpoint-200") #target 20mt but keeps >0 # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_15-12-59onuvlhtv/checkpoint_400/checkpoint-400") # trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_15-12-59onuvlhtv/checkpoint_560/checkpoint-560") # trainer.restore("/home/edoardo/ray_results/APEX_StoppingCar_2020-12-29_17-10-24qjvbq7ew/checkpoint_42/checkpoint-42") trainer.restore( "/home/edoardo/ray_results/APEX_StoppingCar_2020-12-30_07-08-19vc1f79qh/checkpoint_264/checkpoint-264" ) policy = trainer.get_policy() trainer.cleanup() sequential_nn = convert_DQN_ray_policy_to_sequential(policy).cpu() env = StoppingCar() state = env.reset() min_distance = 9999 cumulative_reward = 0 print(state) for i in range(1000): state_reduced = torch.from_numpy(state).float().unsqueeze(0)[:, -2:] action_score = sequential_nn(state_reduced) action = torch.argmax(action_score).item() print(f"action: {action}") state, reward, done, _ = env.step(action) min_distance = min(state[7], min_distance) cumulative_reward += reward print( f"iteration: {i}, delta_x: {state[7]:.2f}, delta_v: {state[6]:.2f}, v_ego: {state[3]:.2f},v_lead: {state[2]:.2f} , y_ego: {state[5]:.2f}, reward: {reward}" )
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
l0 = torch.nn.Linear(6, 2, bias=False)
l0.weight = torch.nn.Parameter(torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]], dtype=torch.float32))
layers = [l0]
for l in sequential_nn:
    layers.append(l)
sequential_nn2 = torch.nn.Sequential(*layers)
plot_index = 7
x_index = 0
position_list = []
x_list = []
env = StoppingCar()
env.reset()
env.x_lead = 20
env.x_ego = 0
env.v_lead = 28
env.v_ego = 28 + 10
min_distance = 9999
state_np = np.array([env.x_lead, env.x_ego, env.v_lead, env.v_ego, env.y_lead, env.y_ego, env.v_lead - env.v_ego, env.x_lead - env.x_ego])
print(state_np)
for n in range(1):
    cumulative_reward = 0
    # env.reset()
    # env.x_ego = env.np_random.uniform(0, 10)
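    # Sanity-check sketch (illustrative, not part of the original script): the prepended layer l0 maps
    # the first six state components [x_lead, x_ego, v_lead, v_ego, y_lead, y_ego] to
    # [v_lead - v_ego, x_lead - x_ego], i.e. (delta_v, delta_x), so sequential_nn2 accepts the full
    # state while the trained policy still only sees the two deltas.
    full_state = torch.tensor([[env.x_lead, env.x_ego, env.v_lead, env.v_ego, env.y_lead, env.y_ego]],
                              dtype=torch.float32)
    expected = torch.tensor([[env.v_lead - env.v_ego, env.x_lead - env.x_ego]], dtype=torch.float32)
    assert torch.allclose(l0(full_state), expected)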
import datetime
import os

import numpy as np
import torch.nn
from matplotlib import cm
from matplotlib.colors import Normalize

import utils
from environment.stopping_car import StoppingCar
from runnables.invariant.retrain_agent import GridSearchDataset
from training.dqn.safe_dqn_agent import InvariantAgent

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
config = {"cost_fn": 1, "simplified": True}
env = StoppingCar(config)  # env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 2
action_size = 2
STARTING_BETA = 0.6  # higher values reduce the influence of high-TD transitions
ALPHA = 0.6  # higher values make sampling more aggressive towards high-TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01
agent = InvariantAgent(state_size=state_size, action_size=action_size, alpha=ALPHA)
agent.load(os.path.join(utils.get_save_dir(),
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from tensorboardX import SummaryWriter

from environment.stopping_car import StoppingCar
from mosaic.utils import chunks
from training.dqn.safe_dqn_agent import InvariantAgent, device, SafetyLoss, TAU
from utility.Scheduler import Scheduler

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
config = {"cost_fn": 1, "simplified": True}
env = StoppingCar(config)  # env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 2
action_size = 2
STARTING_BETA = 0.6  # higher values reduce the influence of high-TD transitions
ALPHA = 0.6  # higher values make sampling more aggressive towards high-TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01
current_time = currentDT.strftime('%b%d_%H-%M-%S')
comment = "invariant"
log_dir = os.path.join('/home/edoardo/Development/SafeDRL/runs', current_time + '_' + comment)
os.mkdir(log_dir)
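# Sketch (illustrative; the original script presumably opens its own writer): tensorboardX's
# SummaryWriter is imported above and log_dir has just been created, so a writer for this run can be
# pointed at it. The variable name `writer` is hypothetical.
writer = SummaryWriter(log_dir)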
map_location=torch.device('cpu')))  # load the invariant model
# %%
agent_model.cpu()
invariant_model.cpu()
old_agent_model.cpu()
val_data = GridSearchDataset()
random.seed(0)
x_data = []
xprime_data = []
old_xprime_data = []
y_data = []
changed_indices = []
for i, data in enumerate(random.sample(val_data.dataset, k=7000)):
    value = torch.tanh(invariant_model(data)).item()
    action = torch.argmax(agent_model(data)).item()
    next_state_np, reward, done, _ = StoppingCar.compute_successor(data.numpy(), action)
    old_action = torch.argmax(old_agent_model(data)).item()
    next_state_np_old, _, _, _ = StoppingCar.compute_successor(data.numpy(), old_action)
    x_data.append(data.numpy())
    xprime_data.append(next_state_np)
    y_data.append(value)
    old_xprime_data.append(next_state_np_old)
    if action != old_action:
        changed_indices.append(i)
x_data = np.array(x_data)
xprime_data = np.array(xprime_data)
old_xprime_data = np.array(old_xprime_data)
changed_indices = np.array(changed_indices)
y_data = np.array(y_data)
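# Plotting sketch (illustrative, not part of the original code): one way to inspect the collected
# arrays is a scatter of the sampled states coloured by the invariant value, with arrows towards the
# successors chosen by the retrained agent. This assumes, as in the grid construction above, that
# each state is ordered (delta_v, delta_x); the output file name is a placeholder.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
points = ax.scatter(x_data[:, 1], x_data[:, 0], c=y_data, cmap="coolwarm", s=4)
ax.quiver(x_data[:, 1], x_data[:, 0],
          xprime_data[:, 1] - x_data[:, 1], xprime_data[:, 0] - x_data[:, 0],
          angles="xy", scale_units="xy", scale=1, width=0.002, alpha=0.3)
ax.set_xlabel("delta_x")
ax.set_ylabel("delta_v")
fig.colorbar(points, ax=ax, label="tanh(invariant)")
fig.savefig("invariant_quiver.png")  # hypothetical output path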