    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()) if args.alpha == 'auto' else 0

        self.evaluation_interval = 50000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_to_run = 10
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.true_gamma_return_mean_history = []
        # self.n_episodes_info_history = []
        self.evaluated_Q_history = []
        self.true_gamma_return_history = []
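The evaluator above holds local copies of the actor and both critics next to the shared ones; a minimal sketch of how the shared parameters could be pulled into the local networks before an evaluation pass (the sync_from_shared helper is an assumption, not part of the original code):

    def sync_from_shared(self):
        # Hypothetical helper: snapshot the shared parameters into the local
        # CPU networks so evaluation runs on frozen weights.
        self.actor.load_state_dict(self.actor_share.state_dict())
        self.Q_net1.load_state_dict(self.Q_net1_share.state_dict())
        self.Q_net2.load_state_dict(self.Q_net2_share.state_dict())
        if self.args.alpha == 'auto':
            self.alpha = np.exp(self.log_alpha_share.detach().item())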
Example #2
    def __init__(self, state_size, action_size, seed, double_agent=False,dueling_agent=False):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            double_agent (bool): True if we want to use DDQN
            dueling_agent (bool): True if we want to use Dueling
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed
        self.double_agent = double_agent
        self.dueling_agent = dueling_agent

        self.qnetwork_local = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed, dueling_agent=dueling_agent).to(device)
        self.optimizer = optim.RMSprop(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
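The agent above relies on module-level hyperparameters and a device handle defined elsewhere in the file; a minimal sketch of plausible definitions (only the names come from the snippet, the values are assumptions):

import torch

BUFFER_SIZE = int(1e5)   # replay buffer size (assumed value)
BATCH_SIZE = 64          # minibatch size (assumed value)
LR = 5e-4                # RMSprop learning rate (assumed value)
UPDATE_EVERY = 4         # learning-step interval mentioned in the comment (assumed value)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")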
Example #3
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        eval_params = {
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter':
            'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': int(2000 + 3 * args.num_actors),  # connection port
            'task_mode':
            'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name, params=eval_params)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 50  # 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #4
    def __init__(self, args,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)

        simu_params = {
            'number_of_vehicles': 0,
            'number_of_walkers': 0,
            'obs_size': (160, 100),  # screen size of cv2 window
            'dt': 0.025,  # time interval between two frames
            'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
            'port': 2000,  # connection port
            'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
            'code_mode': 'test',
            'max_time_episode': 100,  # maximum timesteps per episode
            'desired_speed': 15,  # desired speed (m/s)
            'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
        }

        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name, params=simu_params)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train
        # self.load_index = 40000

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/policy1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/Q1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load('./' + self.args.env_name + '/method_' +
                           str(self.args.method) + '/model/Q2_' +
                           str(self.load_index) + '.pkl',
                           map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
Example #5
    def __init__(self, args, shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = self.args.max_train

        self.actor = PolicyNet(args).to(self.device)
        self.actor.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/policy1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net1.load_state_dict(
            torch.load('./' + self.args.env_name + '/method_' +
                       str(self.args.method) + '/model/Q1_' +
                       str(self.load_index) + '.pkl',
                       map_location='cpu'))

        if self.args.double_Q:
            self.Q_net2 = QNet(args).to(self.device)
            self.Q_net2.load_state_dict(
                torch.load('./' + self.args.env_name + '/method_' +
                           str(self.args.method) + '/model/Q2_' +
                           str(self.load_index) + '.pkl',
                           map_location='cpu'))

        self.test_step = 0
        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_history = []
        self.Q_std_history = []
Example #6
    def __init__(self, args, shared_queue,shared_value):
        super(Simulation, self).__init__()
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.policy_test_queue = shared_queue[3]
        self.stop_sign = shared_value[1]
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.load_index = 20000

        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell, args.action_high, args.action_low, args.NN_type).to(self.device)
        self.actor.load_state_dict(torch.load('./data/method_' + str(1) + '/model/policy_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m0 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m0.load_state_dict(torch.load('./data/method_' + str(0) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q1_' + str(self.load_index) + '.pkl'))
        self.Q_net2_m1 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net2_m1.load_state_dict(torch.load('./data/method_' + str(1) + '/model/Q2_' + str(self.load_index) + '.pkl'))

        self.Q_net1_m2 = QNet(args.state_dim, args.action_dim, args.num_hidden_cell, args.NN_type).to(self.device)
        self.Q_net1_m2.load_state_dict(torch.load('./data/method_' + str(2) + '/model/Q1_' + str(self.load_index) + '.pkl'))

        self.test_step = 0

        self.save_interval = 10000
        self.iteration = 0
        self.reward_history = []
        self.entropy_history = []
        self.epoch_history = []
        self.done_history = []
        self.Q_real_history = []
        self.Q_m0_history = []
        self.Q_m1_history = []
        self.Q_m2_history = []
        self.Q_std_m2_history = []
Example #7
    def __init__(self, args, shared_value, share_net):
        seed = args.seed
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value
        self.share_net = share_net
        self.args = args
        self.env = gym.make(args.env_name)
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)
        self.Q_net2 = QNet(args).to(self.device)
        self.actor_share = share_net[4]
        self.Q_net1_share = share_net[0]
        self.Q_net2_share = share_net[2]
        self.log_alpha_share = share_net[-1]
        self.alpha = np.exp(self.log_alpha_share.detach().item()
                            ) if args.alpha == 'auto' else 0

        self.evaluation_interval = 20000
        self.max_state_num_evaluated_in_an_episode = 500
        self.episode_num_evaluation = 5
        self.episode_num_test = 5
        self.time = time.time()
        self.list_of_n_episode_rewards_history = []
        self.time_history = []
        self.alpha_history = []
        self.average_return_with_diff_base_history = []
        self.average_reward_history = []
        self.iteration_history = []
        self.evaluated_Q_mean_history = []
        self.evaluated_Q_std_history = []
        self.true_gamma_return_mean_history = []
        self.policy_entropy_history = []
        self.a_std_history = []
        self.a_abs_history = []
Example #8
    def __init__(self, args, shared_queue, shared_value, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.experience_queue = shared_queue[0]
        self.policy_param_queue = shared_queue[1]
        self.q_param_queue = shared_queue[2]
        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.device = torch.device("cpu")
        self.actor = PolicyNet(args.state_dim, args.num_hidden_cell,
                               args.action_high, args.action_low,
                               args.NN_type).to(self.device)
        self.Q_net1 = QNet(args.state_dim, args.action_dim,
                           args.num_hidden_cell, args.NN_type).to(self.device)
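This actor keeps references to policy_param_queue and q_param_queue; a minimal sketch of how pending parameter snapshots might be drained into the local networks (the receive_params helper, and the assumption that the queues carry state_dicts, are not from the original code):

    def receive_params(self):
        # Hypothetical helper: consume queued parameter snapshots, keeping
        # only the most recent one in the local actor and critic.
        while not self.policy_param_queue.empty():
            self.actor.load_state_dict(self.policy_param_queue.get())
        while not self.q_param_queue.empty():
            self.Q_net1.load_state_dict(self.q_param_queue.get())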
Example #9
    def __init__(self, args, shared_queue, shared_value, share_net, lock, i):
        super(Actor, self).__init__()
        self.agent_id = i
        seed = args.seed + np.int64(self.agent_id)
        np.random.seed(seed)
        torch.manual_seed(seed)

        self.counter = shared_value[0]
        self.stop_sign = shared_value[1]
        self.lock = lock
        self.env = gym.make(args.env_name)
        self.args = args
        self.experience_in_queue = []
        for i in range(args.num_buffers):
            self.experience_in_queue.append(shared_queue[0][i])

        self.device = torch.device("cpu")
        self.actor = PolicyNet(args).to(self.device)
        self.Q_net1 = QNet(args).to(self.device)

        self.Q_net1_share = share_net[1]
        self.actor_share = share_net[0]
Example #10
from __future__ import print_function
Example #11
    def __init__(self, args, shared_queue, shared_value, share_net,
                 share_optimizer, device, lock, i):
        super(Learner, self).__init__()
        self.args = args
        seed = self.args.seed
        self.init_time = self.args.init_time
        np.random.seed(seed)
        torch.manual_seed(seed)
        self.agent_id = i

        self.experience_out_queue = []
        for i in range(args.num_buffers):
            self.experience_out_queue.append(shared_queue[1][i])

        self.stop_sign = shared_value[1]
        self.iteration_counter = shared_value[2]
        self.iteration = self.iteration_counter.value

        self.device = device
        if self.device == torch.device("cpu"):
            self.gpu = False
        else:
            self.gpu = True
        self.lock = lock

        (self.Q_net1_share, self.Q_net1_target_share, self.Q_net2_share,
         self.Q_net2_target_share, self.actor1_share, self.actor1_target_share,
         self.actor2_share, self.actor2_target_share,
         self.log_alpha_share) = share_net
        (self.Q_net1_optimizer, self.Q_net2_optimizer, self.actor1_optimizer,
         self.actor2_optimizer, self.alpha_optimizer) = share_optimizer

        self.Q_net1 = QNet(args).to(self.device)
        self.scheduler_Q_net1 = lr_scheduler.CosineAnnealingLR(
            self.Q_net1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net1.train()

        self.Q_net1_target = QNet(args).to(self.device)
        self.Q_net1_target.train()

        self.Q_net2 = QNet(args).to(self.device)
        self.scheduler_Q_net2 = lr_scheduler.CosineAnnealingLR(
            self.Q_net2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.Q_net2.train()

        self.Q_net2_target = QNet(args).to(self.device)
        self.Q_net2_target.train()

        self.actor1 = PolicyNet(args).to(self.device)
        self.scheduler_actor1 = lr_scheduler.CosineAnnealingLR(
            self.actor1_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor1.train()

        self.actor1_target = PolicyNet(args).to(self.device)
        self.actor1_target.train()

        self.actor2 = PolicyNet(args).to(self.device)
        self.scheduler_actor2 = lr_scheduler.CosineAnnealingLR(
            self.actor2_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)
        self.actor2.train()

        self.actor2_target = PolicyNet(args).to(self.device)
        self.actor2_target.train()

        self.scheduler_alpha = lr_scheduler.CosineAnnealingLR(
            self.alpha_optimizer,
            T_max=self.args.decay_T_max,
            eta_min=self.args.end_lr,
            last_epoch=-1)

        if self.args.alpha == 'auto':
            self.target_entropy = args.target_entropy
        else:
            self.alpha = torch.tensor(self.args.alpha)
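The learner above builds target copies of both critics and both actors; a minimal sketch of the soft target update such a learner typically applies after a gradient step (the soft_update helper and the tau value are assumptions, not code from the original class):

    def soft_update(self, net, net_target, tau=0.005):
        # Hypothetical helper: Polyak-average the online parameters into the
        # target network, theta_target <- tau * theta + (1 - tau) * theta_target.
        with torch.no_grad():
            for p, p_targ in zip(net.parameters(), net_target.parameters()):
                p_targ.mul_(1.0 - tau).add_(tau * p)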
Example #12
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  #+ sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            #device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
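A minimal launch sketch for the training entry point above, assuming the script is run directly; the 'spawn' start method and the method id are assumptions, not taken from the source:

if __name__ == "__main__":
    import torch.multiprocessing as mp
    # 'spawn' tends to be the safer start method once GPU learners are involved (assumption)
    mp.set_start_method('spawn', force=True)
    main(method=0)  # valid method ids are defined by built_parser (assumed value)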
Example #13
def main(method):

    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter':
        'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        'task_mode':
        'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    print("Network inited")

    if args.code_model == "eval":
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    print("Optimizer done")

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #14
def main():
    # parameters for the gym_carla environment
    params = {
        'display_size': 256,  # screen size of bird-eye render
        'obs_size': 128,  # screen size of cv2 window
        'dt': 0.1,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining ego vehicle
        'port': 2000,  # connection port
        # 'town': 'Town01',  # which town to simulate
        'task_mode': 'Straight',  # mode of the task, [random, roundabout (only for Town03)]
        'code_mode': 'test',
        'max_time_episode': 5000,  # maximum timesteps per episode
        'desired_speed': 8,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum times to spawn ego vehicle
    }

    # Set gym-carla environment
    env = gym.make('carla-v0', params=params)

    # load net
    device = torch.device('cpu')
    args = Args()
    args.NN_type
    actor = PolicyNet(args).to(device)
    actor.load_state_dict(torch.load('./policy1_500000.pkl', map_location='cpu'))

    Q_net1 = QNet(args).to(device)
    Q_net1.load_state_dict(torch.load('./Q1_500000.pkl', map_location='cpu'))

    obs, info_dict = env.reset()
    info = info_dict_to_array(info_dict)

    state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
    info_tensor = torch.FloatTensor(info.copy()).float().to(device)

    # print(env.ego.get_location())
    tic = time.time()
    done = False
    ret = 0
    start = carla.Location(x=env.start[0], y=env.start[1], z=0.22)
    end = carla.Location(x=env.dest[0], y=env.dest[1], z=0.22)

    if args.NN_type == "CNN":
        state_tensor = state_tensor.permute(2, 0, 1)

    while not done:
        tac = time.time()

        u, log_prob = actor.get_action(state_tensor.unsqueeze(0), info_tensor.unsqueeze(0), True)
        u = u.squeeze(0)

        obs, r, done, info_dict = env.step(u)

        # convert the freshly returned info dict for the next policy input
        info = info_dict_to_array(info_dict)
        state_tensor = torch.FloatTensor(obs.copy()).float().to(device)
        if args.NN_type == "CNN":
            state_tensor = state_tensor.permute(2, 0, 1)
        info_tensor = torch.FloatTensor(info.copy()).float().to(device)

        ret += r
        cv2.imshow("camera img", obs)
        cv2.waitKey(1)
        # print(info['acceleration_t'].shape)
        env.world.debug.draw_point(start)
        env.world.debug.draw_point(end)

        if done:
            toc = time.time()
            print("An episode took %f s" %(toc - tic))
            print("total reward is", ret)
            print("time steps", env.time_step)
            env.close()
            env.reset()
            ret = 0
            # print(env.ego.get_location())
            done = False
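The rollout above converts the environment's info dict with info_dict_to_array, which is not shown in the snippet; a minimal sketch of what such a helper might look like (the key selection and ordering are assumptions):

import numpy as np

def info_dict_to_array(info):
    # Hypothetical helper: flatten every numeric entry of the info dict into a
    # single 1-D float32 vector, in a fixed (sorted-key) order.
    pieces = [np.asarray(info[key], dtype=np.float32).ravel()
              for key in sorted(info.keys())]
    return np.concatenate(pieces) if pieces else np.zeros(0, dtype=np.float32)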
Example #15
        # 1	Cart Velocity             -Inf            Inf
        int(check_bound(np.degrees(observation[2]), np.arange(-11, 11, 1))),
        # 2	Pole Angle                 -24 deg        24 deg
        int(check_bound(observation[3], np.arange(-0.88, 0.88, 0.08)))
        # 3	Pole Velocity At Tip      -Inf            Inf
    ]


# Create Agent
actions = range(env.action_space.n)
agent = Agent(None, (25, 25, 25), actions)
temp_agent = agent.__copy__()

# Create Network
net_size = 128
net = QNet(env.observation_space.shape[0], env.action_space.n, net_size,
           device).to(device)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
net.train()

ok = False
guts = 0
i_episode = 0
total = 0
loss = 0
guts_required = 100
guts_print_div = 10
big_data = [[], []]
print("Learning...")
while not ok:
    # Agent learning
    while guts < guts_required: