Example #1
    def __init__(self, num_inputs, action_space, param, action_bound):

        self.action_bound = action_bound
        self.alpha = param['alpha']
        self.gamma = param['gamma']
        self.tau = param['tau']
        self.target_update_interval = param['target_update_interval']
        self.automatic_entropy_tuning = param['automatic_entropy_tuning']
        self.lr = param['lr']

        self.device = torch.device("cuda" if param['cuda'] else "cpu")

        self.critic = QNetwork(num_inputs, action_space).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=param['lr'])

        self.policy = CNNPolicy(num_inputs, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=param['lr'])
def run():

    # Set parameters of env
    LASER_HIST = 3
    NUM_ENV = 1  # the number of agents in the environment
    OBS_SIZE = 512  # number of laser beams
    action_bound = [[0, -1], [1, 1]]  # velocity limits: [min, max] per action dimension

    # Set env and agent policy
    env = StageWorld(
        OBS_SIZE, index=0, num_env=NUM_ENV
    )  # index is used for parallel execution; 0 selects the first agent
    trained_model_file = os.path.dirname(__file__) + '/policy/stage2.pth'
    policy = CNNPolicy(frames=LASER_HIST, action_space=2)
    policy.cpu()  # use policy.cuda() for GPU
    state_dict = torch.load(
        trained_model_file,
        map_location=torch.device('cpu'))  # use torch.load(trained_model_file) for GPU
    policy.load_state_dict(state_dict)

    rospy.init_node('rl_collision_avoidance_tb3', anonymous=False)
    print(
        '==================================\nrl_collision_avoidance node started'
    )

    nn_tb3 = NN_tb3(env,
                    policy,
                    action_bound,
                    OBS_SIZE,
                    index=0,
                    num_env=NUM_ENV)
    rospy.on_shutdown(nn_tb3.on_shutdown)

    rospy.spin()
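The constructor above (shown in full as class SAC in Example #4) reads all of its hyperparameters from a `param` dict. Below is a minimal sketch of such a dict; the concrete values and the `num_inputs`/`action_space` arguments are illustrative assumptions, not settings taken from the original project.

# Hypothetical configuration; every value here is an assumption for illustration.
param = {
    'alpha': 0.2,                       # initial entropy coefficient
    'gamma': 0.99,                      # discount factor
    'tau': 0.005,                       # soft-update rate for the target critic
    'target_update_interval': 1,        # soft-update frequency (in update calls)
    'automatic_entropy_tuning': True,   # learn alpha via the log_alpha optimizer
    'lr': 3e-4,                         # learning rate for all Adam optimizers
    'cuda': torch.cuda.is_available(),  # selects the torch device
}
agent = SAC(num_inputs=3,               # presumably LASER_HIST, the number of stacked laser frames
            action_space=2,             # 2-D action, as in CNNPolicy(..., action_space=2)
            param=param,
            action_bound=[[0, -1], [1, 1]])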
Example #3
    logger_cal.setLevel(logging.INFO)
    cal_f_handler = logging.FileHandler(cal_file, mode='a')
    cal_f_handler.setLevel(logging.INFO)
    logger_cal.addHandler(cal_f_handler)

    # logger.info('rosport: %d robotIndex: %d rank:%d' %(arg_list.rosports[rank],arg_list.robotIds[rank],rank))
    reward = None
    action_bound = [[0, -1], [1, 1]]

    # torch.manual_seed(1)
    # np.random.seed(1)
    if rank == 0:
        policy_path = policydir
        # policy_path = 'policy'
        # policy = MLPPolicy(obs_size, act_size)
        policy = CNNPolicy(frames=LASER_HIST, action_space=2)
        policy.cuda()
        opt = Adam(policy.parameters(), lr=LEARNING_RATE)
        mse = nn.MSELoss()

        if not os.path.exists(policy_path):
            os.makedirs(policy_path)

        file = policy_path + '/Stage1_%d' % ID_EP
        if os.path.exists(file):
            logger.info('####################################')
            logger.info('############Loading Model###########')
            logger.info('####################################')
            state_dict = torch.load(file)
            policy.load_state_dict(state_dict)
        else:
Example #4
class SAC(object):
    def __init__(self, num_inputs, action_space, param, action_bound):

        self.action_bound = action_bound
        self.alpha = param['alpha']
        self.gamma = param['gamma']
        self.tau = param['tau']
        self.target_update_interval = param['target_update_interval']
        self.automatic_entropy_tuning = param['automatic_entropy_tuning']
        self.lr = param['lr']

        self.device = torch.device("cuda" if param['cuda'] else "cpu")

        self.critic = QNetwork(num_inputs, action_space).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = -dim(A) (e.g. -6 for HalfCheetah-v2), as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space).to(self.device)).item()
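            # Note: the torch.Tensor(action_space) call above assumes action_space is a
            # shape-like sequence; if an int is passed (as in
            # CNNPolicy(..., action_space=2) elsewhere in these examples), it yields an
            # uninitialized tensor rather than the action dimensionality.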
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=param['lr'])

        self.policy = CNNPolicy(num_inputs, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=param['lr'])

    def select_action(self, state, evaluate=False):
        # state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        obs_stack, goal_list, speed = state
        obs_stack = torch.from_numpy(obs_stack).float().to(self.device)
        goal_list = torch.from_numpy(goal_list).float().to(self.device)
        speed = torch.from_numpy(speed).float().to(self.device)

        if evaluate is False:
            action, _, _ = self.policy.sample(obs_stack, goal_list, speed)
        else:
            _, _, action = self.policy.sample(obs_stack, goal_list, speed)

        action = action.data.cpu().numpy()
        scaled_action = np.clip(action,
                                a_min=self.action_bound[0],
                                a_max=self.action_bound[1])

        return scaled_action

    def update_parameters(self, memory, updates):
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = memory
        obss, goals, speeds = state_batch
        obss_, goals_, speeds_ = next_state_batch

        # state_batch = torch.FloatTensor(state_batch).to(self.device)
        # next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)

        obss = torch.FloatTensor(obss).to(self.device)
        goals = torch.FloatTensor(goals).to(self.device)
        speeds = torch.FloatTensor(speeds).to(self.device)
        obss_ = torch.FloatTensor(obss_).to(self.device)
        goals_ = torch.FloatTensor(goals_).to(self.device)
        speeds_ = torch.FloatTensor(speeds_).to(self.device)

        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(
            self.device).unsqueeze(1)
        done_batch = torch.FloatTensor(done_batch).to(self.device).unsqueeze(1)

        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                obss_, goals_, speeds_)
            qf1_next_target, qf2_next_target = self.critic_target(
                obss_, goals_, speeds_, next_state_action)
            min_qf_next_target = torch.min(
                qf1_next_target,
                qf2_next_target) - self.alpha * next_state_log_pi
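            # done_batch is used as a continuation mask below: the replay memory is
            # expected to store 1 for non-terminal transitions and 0 for terminal
            # ones, so that no bootstrapping happens at episode ends.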
            next_q_value = reward_batch + done_batch * self.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(
            obss, goals, speeds, action_batch
        )  # Two Q-functions to mitigate positive bias in the policy improvement step
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)

        pi, log_pi, _ = self.policy.sample(obss, goals, speeds)

        qf1_pi, qf2_pi = self.critic(obss, goals, speeds, pi)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)

        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()

        # Optimize both Q-functions with a single backward pass; backpropagating
        # qf1_loss and qf2_loss separately would fail if the two heads share layers.
        qf_loss = qf1_loss + qf2_loss
        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
            alpha_tlogs = self.alpha.clone()  # For TensorboardX logs
        else:
            alpha_loss = torch.tensor(0.).to(self.device)
            alpha_tlogs = torch.tensor(self.alpha)  # For TensorboardX logs

        if updates % self.target_update_interval == 0:
            soft_update(self.critic_target, self.critic, self.tau)

        return (qf1_loss.item(), qf2_loss.item(), policy_loss.item(),
                alpha_loss.item(), alpha_tlogs.item())

    def get_policy_state_dict(self):
        return self.policy.state_dict()

    def load_policy_state_dict(self, statedict):
        self.policy.load_state_dict(statedict)

    def load_model(self, actor_path, critic_path):
        if actor_path is not None:
            self.policy.load_state_dict(torch.load(actor_path))
        if critic_path is not None:
            self.critic.load_state_dict(torch.load(critic_path))

    def save_model(self,
                   env_name,
                   suffix="",
                   actor_path=None,
                   critic_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/sac_actor_{}_{}".format(env_name, suffix)
        if critic_path is None:
            critic_path = "models/sac_critic_{}_{}".format(env_name, suffix)
        print('Saving models to {} and {}'.format(actor_path, critic_path))
        torch.save(self.policy.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)
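
A minimal training-loop sketch for the SAC class above. Only the method names (select_action, update_parameters, save_model) and the batch layout unpacked in update_parameters come from the code; env, replay_buffer, num_episodes, and batch_size are hypothetical placeholders, and the state tuple follows the (obs_stack, goal, speed) layout consumed by select_action.

# Hypothetical driver loop; env, replay_buffer, num_episodes, and batch_size are
# placeholders, and the transition layout mirrors what update_parameters unpacks.
agent = SAC(num_inputs=3, action_space=2,
            param=param,                     # a dict like the one sketched after Example #1
            action_bound=[[0, -1], [1, 1]])

updates = 0
for episode in range(num_episodes):
    state = env.reset()                      # (obs_stack, goal, speed) numpy arrays
    done = False
    while not done:
        action = agent.select_action(state)  # stochastic action during training
        next_state, reward, done, _ = env.step(action)
        # store 1 - done so the stored flag works as a continuation mask
        replay_buffer.push(state, action, reward, next_state, 1.0 - float(done))
        state = next_state

        if len(replay_buffer) > batch_size:
            # memory = ((obss, goals, speeds), actions, rewards,
            #           (obss_, goals_, speeds_), continuation_mask)
            memory = replay_buffer.sample(batch_size)
            losses = agent.update_parameters(memory, updates)
            updates += 1

agent.save_model('stage_world', suffix='latest')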
Example #5
# Set parameters of env
LASER_HIST = 3
NUM_ENV = 10  # the number of agents in the environment
OBS_SIZE = 512  # number of laser beams
action_bound = [[0, -1], [1, 1]]  # velocity limits: [min, max] per action dimension

# Set env and agent policy
env = StageWorld(
    OBS_SIZE, index=0, num_env=NUM_ENV
)  # index is used for parallel execution; 0 selects the first agent
env.reset_world()
env.reset_pose()

trained_model_file = 'policy/stage2.pth'
policy = CNNPolicy(frames=LASER_HIST, action_space=2)
policy.cpu()  # use policy.cuda() for GPU
state_dict = torch.load(
    trained_model_file,
    map_location=torch.device('cpu'))  # use torch.load(trained_model_file) for GPU
policy.load_state_dict(state_dict)

# Set fake obstacles by fake lidar info
obstacles_location = []
for i in range(NUM_ENV - 1):
    obstacles_location.append(np.zeros(512))
    obstacles_location[i][10 * i:10 * i + 10] = 0.1 * i + 0.05

# Configure agent as host
obs = env.get_laser_observation()
for i in range(NUM_ENV - 1):