Example #1
class fit_dist(object):
    def __init__(self, num_inputs, action_space, args):
        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

    def train(self, memory, batch_size):
        # Sample replay buffer / batch
        # state_np, action_np, reward_np, next_state_np, mask_np = memory.sample(batch_size=batch_size)
        state_np, next_state_np, action_np, reward_np, mask_np = memory.sample(batch_size=batch_size)
        state_batch = torch.FloatTensor(state_np).to(device)
        action_batch = torch.FloatTensor(action_np).to(device)

        log_prob = self.policy.log_prob(state_batch, action_batch)
        loss = -log_prob.mean()

        self.policy_optim.zero_grad()
        loss.backward()
        self.policy_optim.step()

        return loss.item()

    # Save model parameters
    def save_model(self, buffer_type, env_name, suffix="", actor_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if actor_path is None:
            actor_path = "models/{}_SAC_prior_{}_{}".format(buffer_type, env_name, suffix)
        print('Saving models to {}'.format(actor_path))
        torch.save(self.policy.state_dict(), actor_path)
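A minimal usage sketch for the fit_dist behavior prior above. The env, args, and memory objects are assumed to exist with the interfaces used in the snippet (an args namespace compatible with GaussianPolicy and a replay buffer whose sample() matches the unpacking in train), so treat this as illustrative rather than the project's actual training script.

# Hypothetical driver loop: fit the prior by maximizing the log-likelihood
# of the actions stored in an offline replay buffer.
prior = fit_dist(env.observation_space.shape[0], env.action_space, args)
for step in range(10000):
    nll = prior.train(memory, batch_size=256)  # returns -mean log-prob
    if step % 1000 == 0:
        print('step {}: NLL = {:.3f}'.format(step, nll))
prior.save_model('replay', 'HalfCheetah-v2')  # placeholder buffer_type / env_name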
Example #2
File: bear.py Project: ryanxhr/BEAR
    def __init__(self, num_inputs, action_space, args):
        self.gamma = args.gamma
        self.tau = args.tau
        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), weight_decay=1e-2)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=1e-4)

        self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0],
                                            args.hidden_size,
                                            action_space).to(self.device)
        hard_update(self.policy_target, self.policy)

        self.dual_lambda = args.init_dual_lambda
        self.dual_step_size = args.dual_step_size
        self.cost_epsilon = args.cost_epsilon
        self.coefficient_weight = args.coefficient_weight
        self.dual_steps = args.dual_steps
        self.dirac_policy_num = args.dirac_policy_num
        self.m = args.m
        self.n = args.n
        self.mmd_before_tanh = args.mmd_before_tanh
Example #3
    def __init__(self, num_inputs, action_space, args):
        self.gamma = args.gamma
        self.tau = args.tau

        self.critic = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0], args.hidden_size).to(device)
        hard_update(self.critic_target, self.critic)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.policy_target = GaussianPolicy(num_inputs, action_space.shape[0], args.hidden_size, action_space).to(device)
        hard_update(self.policy_target, self.policy)

        # dual_lambda
        self.dual_lambda = args.init_dual_lambda
        self.dual_step_size = args.dual_step_size
        self.cost_epsilon = args.cost_epsilon

        # coefficient_weight assigned to ensemble variance term
        self.coefficient_weight = args.coefficient_weight

        self.dual_grad_times = args.dual_grad_times
Example #4
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_boundary = env_params['action_boundary']
        self.max_episode_steps = env_params['max_episode_steps']

        self.evaluate_episodes = args.evaluate_episodes
        self.lr_pi = args.lr_pi
        self.lr_v = args.lr_v
        self.gamma = args.gamma
        self.lamda = args.lamda
        self.action_var = args.action_var
        self.clip_range = args.clip_range
        self.temperature_coef = args.temperature_coef
        self.K_updates = args.K_updates
        self.device = torch.device(args.device)
        
        self.load_model_remark = args.load_model_remark

        self.total_trained_goal_num = 0
        self.total_episode_num = 0
        self.total_update_num = 0

        self.buffer = TrajectoryBuffer(self.max_episode_steps, self.o_dim, self.g_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.policy_old = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.V = VFunction(self.o_dim, self.g_dim).to(self.device)
        self.V_old = VFunction(self.o_dim, self.g_dim).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_v = optim.Adam(self.V.parameters(), lr=self.lr_v)

        self.hard_update()
Example #5
    def __init__(self):

        self.gamma = 0.99
        self.tau = 0.005
        self.alpha = 0.2
        self.lr = 0.003

        self.target_update_interval = 1
        self.device = torch.device("cpu")

        # 8 phases
        self.num_inputs = 8
        self.num_actions = 1
        self.hidden_size = 256

        self.critic = QNetwork(self.num_inputs, self.num_actions,
                               self.hidden_size).to(self.device)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(self.num_inputs, self.num_actions,
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        # Copy the parameters of critic to critic_target

        self.target_entropy = -torch.Tensor([1.0]).to(self.device).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

        self.alpha_optimizer = Adam([self.log_alpha], lr=self.lr)

        self.policy = GaussianPolicy(self.num_inputs, self.num_actions,
                                     self.hidden_size).to(self.device)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=self.lr)
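Several of the constructors on this page call hard_update immediately after building the target critic. Below is a sketch of the conventional hard_update / soft_update helpers used in pytorch-soft-actor-critic-style code; this is an assumption about those helpers, not code copied from any of the projects above.

def hard_update(target, source):
    # Copy every source parameter into the target network verbatim.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)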
Example #6
 def __init__(self, num_inputs, action_space, args):
     self.device = torch.device("cpu")
     self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                  args.hidden_size,
                                  action_space).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
     self.genbuffer_algo = args.genbuffer_algo
Example #7
File: agent.py Project: Kavka1/RL
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.action_scale = np.array(env_params['action_scale'],
                                     dtype=np.float32)
        self.action_bias = np.array(env_params['action_bias'],
                                    dtype=np.float32)
        self.action_boundary = env_params['action_boundary']
        self.device = torch.device(args.device)

        self.lr = args.lr
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.memory_size = args.memory_size
        self.batch_size = args.batch_size
        self.target_update_interval = args.target_update_interval

        self.memory = Memory(self.memory_size, self.o_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.a_dim).to(self.device)
        self.critic = TwinQFunction(self.o_dim, self.a_dim).to(self.device)
        self.critic_target = TwinQFunction(self.o_dim,
                                           self.a_dim).to(self.device)

        self.target_entropy = -torch.prod(
            torch.Tensor(self.a_dim).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.critic.parameters(), lr=self.lr)
        self.optimizer_alpha = optim.Adam([self.log_alpha], lr=self.lr)

        self.hard_update_target()
Example #8
File: sac.py Project: dmitrySorokin/SAC
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        if self.automatic_entropy_tuning is True:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size,
                                     action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
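The target_entropy / log_alpha / alpha_optim trio created here is consumed later in the update step. A hedged sketch of the standard SAC temperature update follows; log_pi is assumed to come from self.policy.sample(state_batch), and this is not code taken from the dmitrySorokin/SAC project.

        # Hypothetical alpha update inside update_parameters(), assuming
        # pi, log_pi, _ = self.policy.sample(state_batch) has already been computed.
        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()
            self.alpha = self.log_alpha.exp()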
Example #9
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau
        self.scale_R = args.scale_R
        self.reparam = args.reparam
        self.deterministic = args.deterministic
        self.target_update_interval = args.target_update_interval

        self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                     args.hidden_size)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.deterministic == False:
            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
            self.value_criterion = nn.MSELoss()
        else:
            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)

        self.soft_q_criterion = nn.MSELoss()
Example #10
    def __init__(self, num_inputs, action_space, \
                 device, hidden_size, seed, lr, gamma, tau, alpha):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha

        self.device = device
        self.seed = seed
        self.seed = torch.manual_seed(seed)

        torch.cuda.manual_seed(seed)
        #torch.cuda.manual_seed_all(seed)
        #torch.backends.cudnn.deterministic=True

        self.critic = QNetwork(seed, num_inputs, action_space.shape[0],
                               hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=lr)

        self.critic_target = QNetwork(seed, num_inputs, action_space.shape[0],
                                      hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = Adam([self.log_alpha], lr=lr)
        self.policy = GaussianPolicy(seed, num_inputs, action_space.shape[0], \
                                         hidden_size, action_space).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=lr)
Example #11
    def __init__(self):
        #Creating environment
        self.env = gym.make(settings.env_name)
        self.env.seed(settings.seed)
        self.env.action_space.seed(settings.seed)

        self.state_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.shape[0]

        self.obs_normalizer = Normalizer(self.state_space)

        self.device = torch.device(settings.device)
        self.writer = SummaryWriter(
            'runs/' + settings.env_name + "_" + settings.algo +
            '_{}_{}_{}'.format(p.alpha, p.ex_alpha, settings.seed))

        #Initializing common networks and their optimizers
        self.exploitory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploitory_Q = QNet(self.state_space,
                                 self.action_space).to(self.device)
        self.exploitory_Q_target = QNet(self.state_space,
                                        self.action_space).to(self.device)
        self.exploitory_policy_optim = Adam(
            self.exploitory_policy.parameters(), lr=p.lr)
        self.exploitory_Q_optim = Adam(self.exploitory_Q.parameters(), lr=p.lr)

        self.target_update(self.exploitory_Q_target, self.exploitory_Q, 1.0)

        p.alpha = torch.Tensor([p.alpha]).to(self.device)
        if settings.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=p.lr)

        if settings.automatic_ex_entropy_tuning:
            self.ex_target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.ex_log_alpha = torch.zeros(1,
                                            requires_grad=True,
                                            device=self.device)
            self.ex_alpha_optim = Adam([self.ex_log_alpha], lr=p.lr)

        if settings.reward_model == 'novelty':
            self.ex_reward_model = Novelty(self.state_space, self.device)
Example #12
def policy_factory(model_path, env):
    agent = GaussianPolicy(env.observation_space.shape[0],
                           env.action_space.shape[0], 256,
                           env.action_space).to('cpu')

    agent.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu')))

    def policy(obs):
        state = torch.FloatTensor(obs).unsqueeze(0)
        _, _, action = agent.sample(state)
        return action.detach().cpu().numpy()[0]

    return policy
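A possible way to use the returned closure; the environment id and checkpoint path below are placeholders, not values taken from the project.

# Hypothetical rollout with the factory above (old-style gym API, matching the other examples).
env = gym.make('HalfCheetah-v2')                    # placeholder env id
policy = policy_factory('models/policy.pth', env)   # placeholder checkpoint path
obs, done, total = env.reset(), False, 0.0
while not done:
    obs, reward, done, _ = env.step(policy(obs))
    total += reward
print('episode reward:', total)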
Example #13
def main():
    # create config
    config = Config()
    config.game = args.game
    config.algo = 'ppo'
    config.max_steps = int(2e6)
    config.num_envs = 1
    config.optimizer = 'RMSprop'
    config.lr = 0.0003
    config.discount = 0.99
    config.use_gae = True
    config.gae_lambda = 0.95
    config.use_grad_clip = True
    config.max_grad_norm = 0.5
    config.rollout_length = 2048
    config.value_loss_coef = 0.5
    config.entropy_coef = 0
    config.ppo_epoch = 10
    config.ppo_clip_param = 0.2
    config.num_mini_batch = 32
    config.use_gpu = True
    config.seed = args.seed
    config.num_frame_stack = 1
    config.after_set()
    print(config)

    # prepare env, model and logger
    env = make_vec_envs(config.game, num_envs = config.num_envs, seed = config.seed, num_frame_stack= config.num_frame_stack)
    model = GaussianPolicy(env.observation_space.shape[0], action_dim = get_action_dim(env.action_space)).to(config.device)
    logger =  Logger(SummaryWriter(config.save_path), config.num_echo_episodes)

    # create agent and run
    agent = PPOAgent(config, env, model, logger)
    agent.run()
Example #14
    def __init__(self, state_shape, n_actions, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [0.0, 1.0]

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.critic = QNetwork(state_shape, n_actions, args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(state_shape, n_actions, args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(torch.Tensor(n_actions).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)


            self.policy = GaussianPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(state_shape, n_actions, args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #15
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='AntBulletEnv-v0')
    parser.add_argument('--log_name', type=str, default='')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    if args.log_name:
        log_dir = os.path.join('logs', args.env_id, args.log_name)
    else:
        env_dir = os.path.join('logs', args.env_id, '*')
        dirs = glob.glob(env_dir)
        log_dir = max(dirs, key=os.path.getctime)
        print(f'using {log_dir}')

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    env.render()
    while True:
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        print(f'total reward: {episode_reward}')
        time.sleep(1)
Example #16
File: population.py Project: Kavka1/RL
    def __init__(self, id, o_dim, a_dim, action_bound, device, eval_episode):
        self.id = id
        self.fitness = 0.
        self.eval_episode = eval_episode
        self.device = device
        self.action_bound = action_bound

        #self.actor = DeterministicPolicy(o_dim, a_dim).to(device)
        self.actor = GaussianPolicy(o_dim, a_dim).to(self.device)
Example #17
    def __init__(self):
        super(Off_policy, self).__init__()
        self.memory = Replay_buffer(capacity=p.exploitory_policy_memory_size)
        self.exploratory_policy = GaussianPolicy(
            self.state_space, self.action_space).to(self.device)
        self.exploratory_Q = QNet(self.state_space,
                                  self.action_space).to(self.device)
        self.exploratory_Q_target = QNet(self.state_space,
                                         self.action_space).to(self.device)
        self.exploratory_policy_optim = Adam(
            self.exploratory_policy.parameters(), lr=p.lr)
        self.exploratory_Q_optim = Adam(self.exploratory_Q.parameters(),
                                        lr=p.lr)

        self.target_update(self.exploratory_policy, self.exploitory_policy,
                           1.0)

        self.kl_normalizer = Normalizer(1)
        self.ex_rewards_normalizer = Normalizer(1)
Example #18
    def __init__(self, num_inputs, action_space, agent_args):

        self.gamma = agent_args["gamma"]
        self.tau = agent_args["tau"]
        self.alpha = agent_args["alpha"]

        self.policy_type = agent_args["policy"]
        self.target_update_interval = agent_args["target_update_interval"]
        self.automatic_entropy_tuning = agent_args["automatic_entropy_tuning"]

        self.device = torch.device("cuda" if agent_args["cuda"] else "cpu")

        # print("num_inputs::",num_inputs)
        # print("type(action_space)::",type(action_space))
        # print("type(action_space)::",isinstance(action_space,gym.spaces.discrete.Discrete))
        # print(" agent_args['hidden_size']::", agent_args["hidden_size"])

        if isinstance(action_space, gym.spaces.discrete.Discrete):
            action_shape = action_space.n
        else:
            action_shape = action_space.shape[0]
        self.critic = QNetwork(
            num_inputs, action_shape,
            agent_args["hidden_size"]).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=agent_args["lr"])

        self.critic_target = QNetwork(num_inputs, action_shape,
                                      agent_args["hidden_size"]).to(
                                          self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=agent_args["lr"])

            self.policy = GaussianPolicy(num_inputs, action_shape,
                                         agent_args["hidden_size"],
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(),
                                     lr=agent_args["lr"])

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs, action_shape,
                                              agent_args["hidden_size"],
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(),
                                     lr=agent_args["lr"])
Example #19
    def __init__(self, args, env_params):
        self.o_dim = env_params['o_dim']
        self.a_dim = env_params['a_dim']
        self.g_dim = env_params['g_dim']
        self.action_boundary = env_params['action_boundary']
        self.max_episode_steps = env_params['max_episode_steps']

        self.evaluate_episodes = args.evaluate_episodes
        self.lr_pi = args.lr_pi_TD3
        self.lr_q = args.lr_q
        self.gamma = args.gamma
        self.tau = args.tau
        self.action_var = args.action_var
        self.noise_std = args.noise_std
        self.noise_clip = args.noise_clip
        self.K_updates = args.K_updates_TD3
        self.policy_update_interval = args.policy_update_interval
        self.batch_size = args.batch_size
        self.device = torch.device(args.device)
        self.load_model_remark = args.load_model_remark

        self.total_trained_goal_num = 0
        self.total_episode_num = 0
        self.total_update_num = 0
        self.policy_loss_log = 0.
        self.q1_loss_log = 0.
        self.q2_loss_log = 0.

        self.memory = MemoryBuffer(args.memory_capacity, self.o_dim, self.g_dim, self.a_dim)

        self.policy = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.policy_target = GaussianPolicy(self.o_dim, self.g_dim, self.a_dim, self.action_var, self.device).to(self.device)
        self.Q1 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q1_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2 = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)
        self.Q2_target = QFunction(self.o_dim, self.g_dim, self.a_dim).to(self.device)

        self.optimizer_pi = optim.Adam(self.policy.parameters(), lr=self.lr_pi)
        self.optimizer_q1 = optim.Adam(self.Q1.parameters(), lr=self.lr_q)
        self.optimizer_q2 = optim.Adam(self.Q2.parameters(), lr=self.lr_q)

        self.hard_update()
Example #20
def testing():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--num_episode', type=int, default=10)
    args = parser.parse_args()

    num_episode = args.num_episode

    env = gym.make(args.env_name)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(
        env.observation_space.shape[0],
        env.action_space.shape[0],
        hidden_units=[256, 256]).to(device)

    policy.load(os.path.join('models', args.env_name, 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    e_rewards = []
    for _ in range(num_episode):
        state = env.reset()
        episode_reward = 0.
        done = False
        while not done:
            if num_episode <= 1:
                env.render()
            action = exploit(state)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            state = next_state
        e_rewards.append(episode_reward)
    print("Average reward of " + args.env_name + " is %.1f" % np.mean(e_rewards))
    print("Reward std of " + args.env_name + " is %.1f" % np.std(e_rewards))
Example #21
File: sac.py Project: i-am-neet/mapf-sim
    def __init__(self, input_space, action_space, args):

        self.use_expert = args.use_expert
        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha
        self.action_range = [action_space.low, action_space.high]
        self.policy_type = args.policy

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        # self.device = torch.device("cuda" if args.cuda else "cpu")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # print(torch.cuda.is_available())
        # print(torch.cuda.current_device())
        # print(torch.cuda.device(0))
        # print(torch.cuda.device_count())
        # print(torch.cuda.get_device_name())
        # print(torch.backends.cudnn.version())
        # print(torch.backends.cudnn.is_available())

        self.critic = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(input_space, action_space.shape[0], args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(input_space, action_space.shape[0], args.hidden_size, action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            raise ValueError("Not supper another type yet.")
Example #22
 def __init__(self, action_size, state_size, config):
     self.action_size = action_size
     self.state_size = state_size
     self.min_action = config["min_action"]
     self.max_action = config["max_action"]
     self.seed = config["seed"]
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     torch.manual_seed(self.seed)
     np.random.seed(self.seed)
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     print("actions min ", self.min_action)
     print("actions max ", self.max_action)
     self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
     self.alpha = self.log_alpha.exp()
     self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
     #self.policy = SACActor(state_size, action_size).to(self.device)
     self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
     self.max_timesteps = config["max_episodes_steps"]
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.device)
     pathname = config["seed"]
     tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps= 0
     self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item()
Example #23
    def __init__(self, num_inputs, action_space, args):
        #self.n_flow = args.n_flows
        #assert self.n_flow == 0
        self.num_inputs = num_inputs
        #self.flow_family = args.flow_family
        self.num_layers = args.num_layers
        self.args = args

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

        self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                     args.hidden_size, self.num_layers,
                                     args).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #24
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='HalfCheetah-v2')
    parser.add_argument('--log_name', type=str, default='sac-seed0-datetime')
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    log_dir = os.path.join('logs', args.env_id, args.log_name)

    env = gym.make(args.env_id)
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    policy = GaussianPolicy(env.observation_space.shape[0],
                            env.action_space.shape[0],
                            hidden_units=[256, 256]).to(device)

    policy.load(os.path.join(log_dir, 'model', 'policy.pth'))
    grad_false(policy)

    def exploit(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            _, _, action = policy.sample(state)
        return action.cpu().numpy().reshape(-1)

    state = env.reset()
    episode_reward = 0.
    done = False
    while not done:
        env.render()
        action = exploit(state)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        state = next_state
Example #25
    def __init__(self, num_inputs, action_space, config):

        self.gamma = config['gamma']
        self.tau = config['tau']
        self.alpha = config['alpha']

        self.policy_type = config['policy']
        self.target_update_interval = config['target_update_interval']
        self.automatic_entropy_tuning = config['automatic_entropy_tuning']

        self.device = torch.device(
            'cuda:' + str(config['cuda'])) if torch.cuda.is_available(
            ) and config['cuda'] >= 0 else torch.device('cpu')

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               config['hidden_size']).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=config['lr'])

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      config['hidden_size']).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=config['lr'])

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         config['hidden_size'],
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=config['lr'])
Example #26
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        # Q network: estimates the value of each (s_t, a_t) pair
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        # A replica of the critic: because the Bellman target is recursive, the Q network learns from itself, which is unstable.
        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        # Both networks start from the same weights.
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # todo: crunch on this automatic alpha update
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            # Instantiate the policy: given a state, it produces a distribution over actions.
            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            # TODO: what is the difference between the deterministic and Gaussian policies?
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
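The comments in this example describe why a separate critic_target exists. Below is a hedged sketch of how critic and critic_target typically interact in the soft Bellman target (standard SAC form; the batch variables and the use of F.mse_loss are assumptions, not necessarily this project's exact update).

        # Hypothetical critic update: the target network provides a slowly-moving
        # bootstrap value so the online critic does not chase its own estimates.
        with torch.no_grad():
            next_action, next_log_pi, _ = self.policy.sample(next_state_batch)
            q1_next, q2_next = self.critic_target(next_state_batch, next_action)
            min_q_next = torch.min(q1_next, q2_next) - self.alpha * next_log_pi
            next_q_value = reward_batch + mask_batch * self.gamma * min_q_next
        q1, q2 = self.critic(state_batch, action_batch)
        critic_loss = F.mse_loss(q1, next_q_value) + F.mse_loss(q2, next_q_value)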
Example #27
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma  #γ
        self.tau = args.tau  #τ
        self.alpha = args.alpha  #α

        self.policy_type = args.policy  # policy type: Gaussian stochastic policy or deterministic policy
        self.target_update_interval = args.target_update_interval  # target network update interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning  # automatic entropy tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        self.critic = QNetwork(num_inputs,
                               action_space.shape[0], args.hidden_size).to(
                                   device=self.device)  # critic (Q) network
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(
                                          self.device)  #Target Q Network
        hard_update(self.critic_target, self.critic)

        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(
                        self.device)).item()  # torch.prod(input): product of all elements
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #28
    def __init__(self, num_inputs, action_space, args):

        self.gamma = args.gamma
        self.tau = args.tau
        self.alpha = args.alpha

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.device = torch.device("cuda" if args.cuda else "cpu")

        #Similar to Double-QNetwork
        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               args.hidden_size).to(device=self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      args.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)
        #The two networks start with the same initialization

        #Two policy options: stochastic (Gaussian) or deterministic
        if self.policy_type == "Gaussian":
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning is True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         args.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              args.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)
Example #29
    def __init__(self, num_inputs, action_space, args):

        self.num_inputs = num_inputs
        self.action_space = action_space.shape[0]
        self.gamma = args.gamma
        self.tau = args.tau

        self.policy_type = args.policy
        self.target_update_interval = args.target_update_interval
        self.automatic_entropy_tuning = args.automatic_entropy_tuning

        self.critic = QNetwork(self.num_inputs, self.action_space,
                               args.hidden_size)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.lr)

        if self.policy_type == "Gaussian":
            self.alpha = args.alpha
            # Target Entropy = −dim(A) (e.g. , -6 for HalfCheetah-v2) as given in the paper
            if self.automatic_entropy_tuning == True:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape)).item()
                self.log_alpha = torch.zeros(1, requires_grad=True)
                self.alpha_optim = Adam([self.log_alpha], lr=args.lr)
            else:
                pass

            self.policy = GaussianPolicy(self.num_inputs, self.action_space,
                                         args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.value = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_target = ValueNetwork(self.num_inputs, args.hidden_size)
            self.value_optim = Adam(self.value.parameters(), lr=args.lr)
            hard_update(self.value_target, self.value)
        else:
            self.policy = DeterministicPolicy(self.num_inputs,
                                              self.action_space,
                                              args.hidden_size)
            self.policy_optim = Adam(self.policy.parameters(), lr=args.lr)

            self.critic_target = QNetwork(self.num_inputs, self.action_space,
                                          args.hidden_size)
            hard_update(self.critic_target, self.critic)
Example #30
    def __init__(self, num_inputs, action_space, variant):

        self.gamma = variant['gamma']
        self.tau = variant['tau']
        self.alpha = variant['alpha']
        self.policy_type = variant['policy_type']
        self.target_update_interval = variant['target_update_interval']
        self.automatic_entropy_tuning = variant['automatic_entropy_tuning']
        self.lr = variant.get("lr", 1e-3)

        self.device = torch.device("cuda" if variant['cuda'] else "cpu")
        self.hidden_size = variant.get('hidden_size', [128, 128])

        self.critic = QNetwork(num_inputs, action_space.shape[0],
                               self.hidden_size).to(self.device)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        self.critic_target = QNetwork(num_inputs, action_space.shape[0],
                                      self.hidden_size).to(self.device)
        hard_update(self.critic_target, self.critic)

        if self.policy_type == 'Gaussian':
            if self.automatic_entropy_tuning:
                self.target_entropy = -torch.prod(
                    torch.Tensor(action_space.shape).to(self.device)).item()
                self.log_alpha = torch.zeros(1,
                                             requires_grad=True,
                                             device=self.device)
                self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

            self.policy = GaussianPolicy(num_inputs, action_space.shape[0],
                                         self.hidden_size,
                                         action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)

        else:
            self.alpha = 0
            self.automatic_entropy_tuning = False
            self.policy = DeterministicPolicy(num_inputs,
                                              action_space.shape[0],
                                              self.hidden_size,
                                              action_space).to(self.device)
            self.policy_optim = Adam(self.policy.parameters(), lr=self.lr)