Example #1
def __init__(self, path, size=(500, 100), seed=1234, traces=True):
    self.size = size
    config = get_PPO_config(1234, use_gpu=0)
    trainer = ppo.PPOTrainer(config=config)
    trainer.restore(path)
    policy = trainer.get_policy()
    sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
    config = {"cost_fn": 1, "simplified": True}
    self.env = StoppingCar(config)
    self.env.seed(seed)
    load_dataset = True
    file_name = "dataset_new.p"
    if load_dataset and traces and os.path.exists(file_name):
        dataset = pickle.load(open(file_name, "rb"))
    else:
        dataset = []
        # label 1: single transitions sampled from random starting states
        while len(dataset) < size[0]:
            state_np = self.env.reset()  # only starting states
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            action = torch.argmax(sequential_nn(state_reduced)).item()
            next_state_np, reward, done, _ = self.env.step(action)
            dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 1))
        # sweep a grid of (delta_v, delta_x) starting conditions and roll out short traces
        param_grid = {'delta_v': np.arange(-30, 30, 0.5), 'delta_x': np.arange(-10, 40, 0.5)}
        for parameters in ParameterGrid(param_grid):
            delta_v = parameters["delta_v"]
            delta_x = parameters["delta_x"]
            self.env.reset()
            self.env.x_lead = delta_x
            self.env.x_ego = 0
            self.env.v_lead = delta_v
            self.env.v_ego = 0
            done = False
            temp_dataset = []
            state_np = np.array([delta_v, delta_x])
            state_reduced = torch.from_numpy(state_np).float().unsqueeze(0)[:, -2:]  # pick just delta_x and delta_v
            for i in (range(100) if traces else range(1)):
                # action = torch.argmax(sequential_nn(state_reduced)).item()
                action = self.env.perfect_action()
                next_state_np, reward, done, _ = self.env.step(action)
                temp_dataset.append((state_np, next_state_np))
                state_np = next_state_np
                if next_state_np[1] < 0.5 and not done:
                    done = True
                if done:  # only unsafe states
                    break
            # label -1: transitions from traces that ended unsafely; label 0: transitions from safe traces
            if done:
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), -1))
            else:
                for state_np, next_state_np in temp_dataset:
                    dataset.append((state_np.astype(dtype=np.float32), next_state_np.astype(dtype=np.float32), 0))
        if traces:
            pickle.dump(dataset, open(file_name, "wb+"))
    self.dataset = dataset
Example #2
def sample_trajectory(sample, nn, t_max, n_trajectories):
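    # roll out n_trajectories trajectories from the same start state and count how many terminate (done) within t_max steps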
    n_failures = 0
    for n_trajectory in range(n_trajectories):
        state = sample
        for t in range(t_max):
            action_score = torch.softmax(nn(torch.tensor(state, dtype=torch.float32)), 0)
            n_actions = len(action_score)
            action = np.random.choice(n_actions, p=action_score.detach().numpy())
            # action = np.random.choice(n_actions)  # , p=action_score.detach().numpy())
            # action = 1
            successor, cost, done, _ = StoppingCar.compute_successor(state, action)
            if done:
                n_failures += 1
                break
            state = successor
    return n_failures
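
# Hypothetical usage sketch (names and values are illustrative, not from the original file):
# `sequential_nn` would be a small torch.nn.Sequential policy, e.g. the one produced by
# convert_ray_policy_to_sequential(policy) in Example #1.
# start_state = np.array([7.5, 20.0], dtype=np.float32)  # [delta_v, delta_x]
# failures = sample_trajectory(start_state, sequential_nn, t_max=100, n_trajectories=50)
# print(f"estimated failure rate: {failures / 50:.2%}")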
Example #3
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_15-49-16c3ga4n0f/checkpoint_12/checkpoint-12")  # super safe
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_16-46-205jtg2ce8/checkpoint_8/checkpoint-8")
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-28_16-48-24_ovepj_6/checkpoint_27/checkpoint-27")
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_14-48-03xgehld_5/checkpoint_100/checkpoint-100") #target 0 distance
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_14-52-57j5qb7ovs/checkpoint_200/checkpoint-200") #target 20mt but keeps >0
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_15-12-59onuvlhtv/checkpoint_400/checkpoint-400")
# trainer.restore("/home/edoardo/ray_results/DQN_StoppingCar_2020-12-29_15-12-59onuvlhtv/checkpoint_560/checkpoint-560")
# trainer.restore("/home/edoardo/ray_results/APEX_StoppingCar_2020-12-29_17-10-24qjvbq7ew/checkpoint_42/checkpoint-42")
trainer.restore(
    "/home/edoardo/ray_results/APEX_StoppingCar_2020-12-30_07-08-19vc1f79qh/checkpoint_264/checkpoint-264"
)

policy = trainer.get_policy()
trainer.cleanup()
sequential_nn = convert_DQN_ray_policy_to_sequential(policy).cpu()
env = StoppingCar()
state = env.reset()
min_distance = 9999
cumulative_reward = 0
print(state)
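# roll the policy out for up to 1000 steps, tracking the minimum lead distance (state[7] = delta_x)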
for i in range(1000):
    state_reduced = torch.from_numpy(state).float().unsqueeze(0)[:, -2:]
    action_score = sequential_nn(state_reduced)
    action = torch.argmax(action_score).item()
    print(f"action: {action}")
    state, reward, done, _ = env.step(action)
    min_distance = min(state[7], min_distance)
    cumulative_reward += reward
    print(
        f"iteration: {i}, delta_x: {state[7]:.2f}, delta_v: {state[6]:.2f}, v_ego: {state[3]:.2f}, v_lead: {state[2]:.2f}, y_ego: {state[5]:.2f}, reward: {reward}"
    )
Example #4
policy = trainer.get_policy()
sequential_nn = convert_ray_policy_to_sequential(policy).cpu()
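# l0 is a fixed linear layer (no bias, hand-set weights) that maps the raw observation to
# [delta_v, delta_x] = [v_lead - v_ego, x_lead - x_ego], assuming the observation is ordered
# [x_lead, x_ego, v_lead, v_ego, ...] as in state_np below, so the 2-input policy can be stacked on top of it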
l0 = torch.nn.Linear(6, 2, bias=False)
l0.weight = torch.nn.Parameter(
    torch.tensor([[0, 0, 1, -1, 0, 0], [1, -1, 0, 0, 0, 0]],
                 dtype=torch.float32))
layers = [l0]
for l in sequential_nn:
    layers.append(l)

sequential_nn2 = torch.nn.Sequential(*layers)
plot_index = 7
x_index = 0
position_list = []
x_list = []
env = StoppingCar()
env.reset()
env.x_lead = 20
env.x_ego = 0
env.v_lead = 28
env.v_ego = 28 + 10
min_distance = 9999
state_np = np.array([
    env.x_lead, env.x_ego, env.v_lead, env.v_ego, env.y_lead, env.y_ego,
    env.v_lead - env.v_ego, env.x_lead - env.x_ego
])
print(state_np)
for n in range(1):
    cumulative_reward = 0
    # env.reset()
    # env.x_ego = env.np_random.uniform(0, 10)
Example #5
import datetime
import os

import numpy as np
import torch.nn
from matplotlib import cm
from matplotlib.colors import Normalize

import utils
from environment.stopping_car import StoppingCar
from runnables.invariant.retrain_agent import GridSearchDataset
from training.dqn.safe_dqn_agent import InvariantAgent

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
config = {"cost_fn": 1, "simplified": True}
env = StoppingCar(config)
# env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 2
action_size = 2
STARTING_BETA = 0.6  # the higher the more it decreases the influence of high TD transitions
ALPHA = 0.6  # the higher the more aggressive the sampling towards high TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01

agent = InvariantAgent(state_size=state_size,
                       action_size=action_size,
                       alpha=ALPHA)
agent.load(os.path.join(
    utils.get_save_dir(),
Example #6
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
from tensorboardX import SummaryWriter

from environment.stopping_car import StoppingCar
from mosaic.utils import chunks
from training.dqn.safe_dqn_agent import InvariantAgent, device, SafetyLoss, TAU
from utility.Scheduler import Scheduler

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
config = {"cost_fn": 1, "simplified": True}
env = StoppingCar(config)
# env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 2
action_size = 2
STARTING_BETA = 0.6  # the higher the more it decreases the influence of high TD transitions
ALPHA = 0.6  # the higher the more aggressive the sampling towards high TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01

current_time = currentDT.strftime('%b%d_%H-%M-%S')
comment = "invariant"
log_dir = os.path.join('/home/edoardo/Development/SafeDRL/runs',
                       current_time + '_' + comment)
os.mkdir(log_dir)
Example #7
                map_location=torch.device('cpu')))  # load the invariant model
    # %%
    agent_model.cpu()
    invariant_model.cpu()
    old_agent_model.cpu()
    val_data = GridSearchDataset()
    random.seed(0)
    x_data = []
    xprime_data = []
    old_xprime_data = []
    y_data = []
    changed_indices = []
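    # for a random sample of grid states: record the tanh-squashed invariant value, the successor
    # under the current agent and under the old agent, and note the indices where their actions differ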
    for i, data in enumerate(random.sample(val_data.dataset, k=7000)):
        value = torch.tanh(invariant_model(data)).item()
        action = torch.argmax(agent_model(data)).item()
        next_state_np, reward, done, _ = StoppingCar.compute_successor(
            data.numpy(), action)
        old_action = torch.argmax(old_agent_model(data)).item()
        next_state_np_old, _, _, _ = StoppingCar.compute_successor(
            data.numpy(), old_action)
        x_data.append(data.numpy())
        xprime_data.append(next_state_np)
        y_data.append(value)
        old_xprime_data.append(next_state_np_old)
        if action != old_action:
            changed_indices.append(i)

    x_data = np.array(x_data)
    xprime_data = np.array(xprime_data)
    old_xprime_data = np.array(old_xprime_data)
    changed_indices = np.array(changed_indices)
    y_data = np.array(y_data)