示例#1
0
    def __init__(self, nb_state, nb_action):
        """Set up networks, optimisers, replay memory and exploration noise.

        Args:
            nb_state: first constructor argument of ``Actor`` and ``Critic``.
            nb_action: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the Ornstein-Uhlenbeck process.
        """
        self.nb_state, self.nb_action = nb_state, nb_action

        # Plain bookkeeping state; none of it depends on the networks below.
        self.is_training = True
        self.epsilon = 1.0
        self.s_t = None
        self.a_t = None

        # Policy network, its target twin, and the optimiser of the online copy.
        self.actor = Actor(nb_state, nb_action)
        self.actor_target = Actor(nb_state, nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        # Same layout for the critic.
        self.critic = Critic(nb_state, nb_action)
        self.critic_target = Critic(nb_state, nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise.
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        noise_kwargs = dict(size=nb_action,
                            theta=OU_THETA,
                            mu=OU_MU,
                            sigma=OU_SIGMA)
        self.random_process = OrnsteinUhlenbeckProcess(**noise_kwargs)

        if USE_CUDA:
            self.cuda()
示例#2
0
    def __init__(self, nb_states, nb_actions):
        """Create critic/actor pairs, their optimisers and the noise process.

        Args:
            nb_states: first constructor argument of ``Critic`` and ``Actor``.
            nb_actions: second constructor argument of ``Critic`` and
                ``Actor``; also the ``size`` of the noise process.
        """
        # Fixed hyper-parameters.
        self.gamma = 0.99
        self.batch_size = 64

        # Q network and policy network, each with a target copy.
        self.critic = Critic(nb_states, nb_actions)
        self.critic_target = Critic(nb_states, nb_actions)
        self.actor = Actor(nb_states, nb_actions)
        self.actor_target = Actor(nb_states, nb_actions)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        critic_params = self.critic.parameters()
        self.critic_optimizer = torch.optim.Adam(critic_params, lr=0.001)
        actor_params = self.actor.parameters()
        self.actor_optimizer = torch.optim.Adam(actor_params, lr=0.0001)

        self.criterion = nn.MSELoss()

        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=0.15, mu=0, sigma=0.2)

        if USE_CUDA:
            # Same order as before: actor pair first, then critic pair.
            for net in (self.actor, self.actor_target, self.critic,
                        self.critic_target):
                net.cuda()
示例#3
0
    def __init__(self,
                 state_size,
                 action_size,
                 memory_size,
                 batch_size=128,
                 tan=0.001,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 epsilon=1.):
        """Store the settings and build networks, memory and noise process.

        Args:
            state_size: first constructor argument of ``Actor``/``Critic``.
            action_size: second constructor argument of ``Actor``/``Critic``
                and the ``size`` of the noise process.
            memory_size: passed to ``Memory``.
            batch_size: stored as ``self.batch_size``.
            tan: stored as ``self.tan``.
            actor_lr: learning rate of the actor's Adam optimiser.
            critic_lr: learning rate of the critic's Adam optimiser.
            epsilon: stored as ``self.epsilon``.
        """
        self.state_size, self.action_size = state_size, action_size
        self.batch_size, self.tan = batch_size, tan
        self.warmup = WARM_UP
        self.epsilon = epsilon
        self.epsilon_decay = hyperparameters['D_EPSILON']

        # Actor pair and optimiser.
        self.actor = Actor(state_size, action_size)
        self.actor_target = Actor(state_size, action_size)
        actor_params = self.actor.parameters()
        self.actor_optimizer = optim.Adam(actor_params, lr=actor_lr)

        # Critic pair and optimiser.
        self.critic = Critic(state_size, action_size)
        self.critic_target = Critic(state_size, action_size)
        critic_params = self.critic.parameters()
        self.critic_optimizer = optim.Adam(critic_params, lr=critic_lr)

        self.memory = Memory(memory_size)
        self.criterion = nn.MSELoss()

        self.random_process = OrnsteinUhlenbeckProcess(
            size=action_size, theta=0.15, mu=0., sigma=0.2)

        # Pair every online network with its target, actor first.
        for online, target in ((self.actor, self.actor_target),
                               (self.critic, self.critic_target)):
            copy_parameter(online, target)
示例#4
0
    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        """Build actor/critic networks, optimisers and exploration noise.

        Args:
            s_dim: first constructor argument of ``Actor`` and ``Critic``.
            a_dim: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the noise process.
            num_agent: third constructor argument of ``Critic``.
            **kwargs: must contain ``config`` (attributes read here:
                ``a_lr``, ``c_lr``, ``use_cuda``, ``ou_theta``, ``ou_mu``,
                ``ou_sigma``).

        Raises:
            KeyError: if ``config`` is not supplied.
        """
        cfg = kwargs['config']
        self.s_dim, self.a_dim = s_dim, a_dim
        self.config = cfg
        self.num_agent = num_agent

        # Last recorded losses.
        self.a_loss = 0
        self.c_loss = 0

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=cfg.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=cfg.c_lr)

        if cfg.use_cuda:
            for net in (self.actor, self.actor_target, self.critic,
                        self.critic_target):
                net.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=cfg.ou_theta,
                                                       mu=cfg.ou_mu,
                                                       sigma=cfg.ou_sigma)
示例#5
0
    def __init__(self, s_dim, a_dim, **kwargs):
        """Build a single-agent actor/critic setup with its own replay list.

        Args:
            s_dim: first constructor argument of ``Actor`` and ``Critic``.
            a_dim: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the noise process.
            **kwargs: must contain ``config`` (attributes read here:
                ``use_cuda``, ``a_lr``, ``c_lr``, ``ou_theta``, ``ou_mu``,
                ``ou_sigma``, ``epsilon_decay``).

        Raises:
            KeyError: if ``config`` is not supplied.
        """
        cfg = kwargs['config']
        self.s_dim, self.a_dim = s_dim, a_dim
        self.config = cfg
        if cfg.use_cuda:
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        # Last recorded losses.
        self.c_loss = 0
        self.a_loss = 0

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=cfg.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=cfg.c_lr)

        if cfg.use_cuda:
            for net in (self.actor, self.actor_target, self.critic,
                        self.critic_target):
                net.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=cfg.ou_theta,
                                                       mu=cfg.ou_mu,
                                                       sigma=cfg.ou_sigma)

        self.replay_buffer = list()
        # Noise scale and the amount it is reduced by per step.
        self.epsilon = 1.
        self.depsilon = self.epsilon / cfg.epsilon_decay
示例#6
0
    def __init__(self, conf, device):
        """Create the RDPG actor/critic networks on ``device``.

        Args:
            conf: mapping providing ``'state_dim'`` and ``'action_dim'``.
            device: passed to ``.to()`` for every network.
        """
        self.conf = conf
        state_dim = conf['state_dim']
        action_dim = conf['action_dim']
        self.state_dim, self.action_dim = state_dim, action_dim
        self.device = device

        # Exploration schedule and soft-update settings.
        self.epsilon = 1.0
        self.depsilon = 1.0 / 50000
        self.is_training = True
        self.tau = 0.001  # moving average for target network

        # Online and target networks, all moved to the requested device.
        self.actor = Actor_RDPG(state_dim, action_dim).to(device)
        self.actor_target = Actor_RDPG(state_dim, action_dim).to(device)
        self.critic = Critic_RDPG(state_dim, action_dim).to(device)
        self.critic_target = Critic_RDPG(state_dim, action_dim).to(device)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # q_lr / policy_lr are not defined in this method; they are looked up
        # in the enclosing (module) scope.
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=q_lr)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=policy_lr)

        # Exploration noise: theta=0.15, mu=0.0, sigma=0.2.
        self.random_process = OrnsteinUhlenbeckProcess(size=action_dim,
                                                       theta=0.15,
                                                       mu=0.0,
                                                       sigma=0.2)
示例#7
0
    def __init__(self, args, nb_states, nb_actions):
        """Build double-precision actor/critic networks from ``args``.

        Args:
            args: settings object; attributes read here are ``seed``,
                ``gpu_nums``, ``hidden1``, ``hidden2``, ``init_w``, ``p_lr``,
                ``c_lr``, ``weight_decay``, ``rmsize``, ``window_length``,
                ``ou_theta``, ``ou_mu``, ``ou_sigma``, ``bsize``,
                ``tau_update``, ``gamma`` and ``epsilon``.
            nb_states: first constructor argument of ``Actor``/``Critic``.
            nb_actions: second constructor argument of ``Actor``/``Critic``
                and the ``size`` of the noise process.
        """
        cuda_available = torch.cuda.is_available()
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states, self.nb_actions = nb_states, nb_actions

        # GPU ids 0..gpu_nums-1 when CUDA is usable, otherwise the sentinel -1.
        if cuda_available and args.gpu_nums > 0:
            self.gpu_ids = list(range(args.gpu_nums))
        else:
            self.gpu_ids = [-1]
        self.gpu_used = self.gpu_ids[0] >= 0

        net_cfg = dict(hidden1=args.hidden1,
                       hidden2=args.hidden2,
                       init_w=args.init_w)

        self.actor = Actor(nb_states, nb_actions, **net_cfg).double()
        self.actor_target = Actor(nb_states, nb_actions, **net_cfg).double()
        self.actor_optim = Adam(self.actor.parameters(),
                                lr=args.p_lr,
                                weight_decay=args.weight_decay)

        self.critic = Critic(nb_states, nb_actions, **net_cfg).double()
        self.critic_target = Critic(nb_states, nb_actions, **net_cfg).double()
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=args.c_lr,
                                 weight_decay=args.weight_decay)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise.
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters.
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Exploration starts at 1.0 and decays linearly by ``depsilon``.
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        self.continious_action_space = False
示例#8
0
    def __init__(self, nb_states, nb_actions, args):
        """Build three-hidden-layer actor/critic networks and training state.

        Args:
            nb_states: first constructor argument of ``Actor``/``Critic``.
            nb_actions: second constructor argument of ``Actor``/``Critic``
                and the ``size`` of the noise process.
            args: settings object; attributes read here are ``seed``,
                ``init_w``, ``prate``, ``rate``, ``rmsize``,
                ``window_length``, ``ou_theta``, ``ou_mu``, ``ou_sigma``,
                ``bsize``, ``tau``, ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states, self.nb_actions = nb_states, nb_actions

        # Layer widths: 32 per hidden layer for the actor, 64 for the critic.
        actor_net_cfg = dict(hidden1=32,
                             hidden2=32,
                             hidden3=32,
                             init_w=args.init_w)
        critic_net_cfg = dict(hidden1=64,
                              hidden2=64,
                              hidden3=64,
                              init_w=args.init_w)

        self.actor = Actor(nb_states, nb_actions, **actor_net_cfg)
        self.actor_target = Actor(nb_states, nb_actions, **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(nb_states, nb_actions, **critic_net_cfg)
        self.critic_target = Critic(nb_states, nb_actions, **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise.
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters.
        self.batch_size, self.tau = args.bsize, args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # Mutable run state.
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.best_reward = -10
示例#9
0
    def __init__(self,
                 env,
                 mem_size=7 * int(1e3),
                 lr_critic=1e-3,
                 lr_actor=1e-4,
                 epsilon=1.,
                 max_epi=1500,
                 epsilon_decay=1. / (1e5),
                 gamma=.99,
                 target_update_frequency=200,
                 batch_size=64,
                 random_process=True,
                 max_step=None):
        """Wire up an actor-critic agent around ``env``.

        Args:
            env: environment whose ``observation_space.shape[0]`` and
                ``action_space.shape[0]`` size the networks, and whose
                ``action_space.low`` / ``.high`` are stored as bounds.
            mem_size: passed to ``Experience``.
            lr_critic: learning rate of the critic's Adam optimiser.
            lr_actor: magnitude of the actor learning rate; it is negated
                before being handed to Adam (see the note below).
            epsilon: stored as ``self.EPSILON``.
            max_epi: stored as ``self.MAX_EPI``.
            epsilon_decay: stored as ``self.EPSILON_DECAY``.
            gamma: stored as ``self.GAMMA``.
            target_update_frequency: stored as
                ``self.TARGET_UPDATE_FREQUENCY``.
            batch_size: stored as ``self.BATCH_SIZE``.
            random_process: stored as ``self.RAND_PROC``.
            max_step: when not ``None``, written to the environment's
                ``_max_episode_steps`` attribute.
        """
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env  # for recording
        if max_step is not None:
            # Overrides a private attribute of the environment object.
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        # Target networks are put in eval mode and start as exact copies of
        # the online networks.
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        # NOTE(review): the learning rate is negated here. Presumably the
        # training step relies on this to ascend rather than descend its
        # objective -- confirm against the update code. Recent PyTorch
        # releases validate ``lr >= 0`` in ``Adam`` and raise ValueError, so
        # this line needs revisiting together with the actor loss.
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(\
                size=self.N_A, theta=.15, mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        # Empty frame whose columns are named by common.S_EPI and
        # common.S_TOTAL_R.
        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process
示例#10
0
class Predator:
    """Actor-critic agent whose critic is sized for ``num_agent`` agents.

    The instance owns online and target networks, their Adam optimisers, an
    Ornstein-Uhlenbeck noise process and a replay buffer that
    ``get_batches`` samples from.
    """

    def __init__(self, s_dim, a_dim, num_agent, **kwargs):
        """Build networks, optimisers, noise process and replay storage.

        Args:
            s_dim: first constructor argument of ``Actor`` and ``Critic``.
            a_dim: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the noise process.
            num_agent: third constructor argument of ``Critic``; also the
                number of rows returned by ``random_action``.
            **kwargs: must contain ``config`` (attributes read here:
                ``a_lr``, ``c_lr``, ``use_cuda``, ``ou_theta``, ``ou_mu``,
                ``ou_sigma`` and, if present, ``batch_size``).

        Raises:
            KeyError: if ``config`` is not supplied.
        """
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.num_agent = num_agent

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, num_agent)
        self.critic_target = Critic(s_dim, a_dim, num_agent)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.a_loss = 0
        self.c_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)

        # get_batches() reads both attributes below. They used to be left
        # unset, so calling it on a fresh instance raised AttributeError.
        # Callers that assign them after construction keep working.
        self.replay_buffer = list()
        # None when the config carries no batch size; assign one before
        # sampling in that case.
        self.batch_size = getattr(self.config, 'batch_size', None)

    def get_batches(self):
        """Sample ``self.batch_size`` transitions and split them by field.

        Each stored transition is indexed as
        ``(state, action, reward, next_state, done)``.

        Returns:
            Five ``numpy`` arrays, one per field, in the order above.

        Raises:
            ValueError: from ``random.sample`` if the buffer holds fewer
                than ``self.batch_size`` transitions.
        """
        experiences = random.sample(self.replay_buffer, self.batch_size)

        state_batches = np.array([exp[0] for exp in experiences])
        action_batches = np.array([exp[1] for exp in experiences])
        reward_batches = np.array([exp[2] for exp in experiences])
        next_state_batches = np.array([exp[3] for exp in experiences])
        done_batches = np.array([exp[4] for exp in experiences])

        return (state_batches, action_batches, reward_batches,
                next_state_batches, done_batches)

    def random_action(self):
        """Return uniform random actions in [-1, 1), one row per agent."""
        return np.random.uniform(low=-1.,
                                 high=1.,
                                 size=(self.num_agent, self.a_dim))

    def reset(self):
        """Reset the internal state of the exploration noise process."""
        self.random_process.reset_states()
示例#11
0
    def __init__(self, env, args):  #(self, nb_states, nb_actions, args):
        """Build the agent around ``env`` and load previously saved weights.

        Args:
            env: environment; ``observation_space.shape[0]`` and
                ``action_space.shape[0]`` give the network sizes.
            args: settings object; attributes read here are ``seed``,
                ``hidden1``, ``hidden2``, ``init_w``, ``prate``, ``rate``,
                ``output``, ``rmsize``, ``window_length``, ``ou_theta``,
                ``ou_mu``, ``ou_sigma``, ``bsize``, ``tau``, ``discount``
                and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.env = env
        nb_states = self.env.observation_space.shape[0]
        self.nb_states = nb_states
        nb_actions = self.env.action_space.shape[0]
        self.nb_actions = nb_actions

        # Actor and critic share the same layer configuration.
        net_cfg = dict(hidden1=args.hidden1,
                       hidden2=args.hidden2,
                       init_w=args.init_w)

        self.actor = Actor(nb_states, nb_actions, **net_cfg)
        self.actor_target = Actor(nb_states, nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(nb_states, nb_actions, **net_cfg)
        self.critic_target = Critic(nb_states, nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Weights are loaded before the targets are synchronised below.
        self.load_weights(args.output)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise.
        self.memory = SequentialMemory(limit=args.rmsize,
                                       window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters.
        self.batch_size, self.tau = args.bsize, args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # Mutable run state.
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()
示例#12
0
 def __init__(self,
              env,
              actor_model,
              critic_model,
              memory=10000,
              batch_size=64,
              gamma=0.99,
              tau=0.001,
              actor_lr=1e-4,
              critic_lr=1e-3,
              critic_decay=1e-2,
              ou_theta=0.15,
              ou_sigma=0.2,
              render=None,
              evaluate=None,
              save_path=None,
              save_every=10,
              render_every=10,
              train_per_step=True):
     """Wrap ready-made actor/critic models into a trainable agent.

     Args:
         env: environment; ``action_space.shape[0]`` sizes the noise process.
         actor_model: online actor; its ``clone()`` becomes the target.
         critic_model: online critic; its ``clone()`` becomes the target.
         memory: passed to ``ReplayMemory``.
         batch_size, gamma, tau: stored under the same names.
         actor_lr: learning rate of the actor's Adam optimiser.
         critic_lr: learning rate of the critic's Adam optimiser.
         critic_decay: ``weight_decay`` of the critic's Adam optimiser.
         ou_theta, ou_sigma: parameters of the Ornstein-Uhlenbeck process.
         render, evaluate, save_path, save_every, render_every,
         train_per_step: stored under the same names.
     """
     self.env = env

     # Plain settings, kept under the parameter names.
     self.batch_size, self.gamma, self.tau = batch_size, gamma, tau
     self.render, self.render_every = render, render_every
     self.evaluate = evaluate
     self.save_path, self.save_every = save_path, save_every
     self.train_per_step = train_per_step

     # Online networks and their cloned targets.
     self.actor = actor_model
     self.actor_target = actor_model.clone()
     self.critic = critic_model
     self.critic_target = critic_model.clone()
     if use_cuda:
         networks = (self.actor, self.actor_target, self.critic,
                     self.critic_target)
         for net in networks:
             net.cuda()

     self.memory = ReplayMemory(memory)
     action_size = env.action_space.shape[0]
     self.random_process = OrnsteinUhlenbeckProcess(action_size,
                                                    theta=ou_theta,
                                                    sigma=ou_sigma)

     # Only the critic optimiser applies weight decay.
     self.optim_critic = optim.Adam(self.critic.parameters(),
                                    lr=critic_lr,
                                    weight_decay=critic_decay)
     self.optim_actor = optim.Adam(self.actor.parameters(), lr=actor_lr)
示例#13
0
    def __init__(self, nb_states, nb_actions, args):
        """Build actor, critic and reward-predictor networks.

        Args:
            nb_states: first constructor argument of ``Actor``/``Critic``.
            nb_actions: second constructor argument of ``Actor``/``Critic``
                and the ``size`` of the noise process.
            args: settings object; attributes read here are ``seed``,
                ``init_w``, ``ou_theta``, ``ou_mu``, ``ou_sigma``, ``bsize``,
                ``trajectory_length``, ``tau``, ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states, self.nb_actions = nb_states, nb_actions
        init_w = args.init_w

        # Online and target networks.
        self.actor = Actor(nb_states, nb_actions, init_w)
        self.actor_target = Actor(nb_states, nb_actions, init_w)
        self.critic = Critic(nb_states, nb_actions, init_w)
        self.critic_target = Critic(nb_states, nb_actions, init_w)

        # A further Critic instance, stored as the reward predictor.
        self.reward_predictor = Critic(nb_states, nb_actions, init_w)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Exploration noise.
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters.
        self.batch_size = args.bsize
        self.trajectory_length = args.trajectory_length
        self.tau, self.discount = args.tau, args.discount
        self.depsilon = 1.0 / args.epsilon

        # Mutable run state.
        self.epsilon = 1.0
        self.is_training = True

        if USE_CUDA:
            self.cuda()
示例#14
0
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        """Build policy/critic networks, optimisers and bookkeeping state.

        Args:
            s_dim: first constructor argument of ``Actor`` and ``Critic``.
            a_dim: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the noise process.
            n_agents: third constructor argument of ``Actor`` and ``Critic``.
            **kwargs: must contain ``config`` (attributes read here:
                ``use_cuda``, ``a_lr``, ``c_lr``, ``ou_theta``, ``ou_mu``,
                ``ou_sigma``, ``epsilon_decay``).

        Raises:
            KeyError: if ``config`` is not supplied.
        """
        cfg = kwargs['config']
        self.s_dim, self.a_dim = s_dim, a_dim
        self.config = cfg
        self.n_agents = n_agents
        if cfg.use_cuda:
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if cfg.use_cuda:
            for net in (self.policy, self.policy_target, self.critic,
                        self.critic_target):
                net.cuda()

        # Optimisers are created after the optional move to the GPU.
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=cfg.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=cfg.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=cfg.ou_theta,
                                                       mu=cfg.ou_mu,
                                                       sigma=cfg.ou_sigma)

        self.replay_buffer = list()
        # Noise scale and the amount it is reduced by per step.
        self.epsilon = 1.
        self.depsilon = self.epsilon / cfg.epsilon_decay

        # Nothing has been trained or logged yet.
        self.c_loss = None
        self.a_loss = None
        self.action_log = list()
示例#15
0
    def __init__(self, nb_states, nb_actions):
        """Build the agent from module-level hyper-parameter constants.

        Args:
            nb_states: first constructor argument of ``Actor``/``Critic``.
            nb_actions: second constructor argument of ``Actor``/``Critic``
                and the ``size`` of the noise process.
        """
        self.nb_states, self.nb_actions = nb_states, nb_actions

        # Actor pair and optimiser.
        self.actor = Actor(nb_states, nb_actions)
        self.actor_target = Actor(nb_states, nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=ACTOR_LR)

        # Critic pair and optimiser.
        self.critic = Critic(nb_states, nb_actions)
        self.critic_target = Critic(nb_states, nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=CRITIC_LR)

        # Make sure each target starts with the same weights as its source.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise.
        self.memory = SequentialMemory(limit=MEMORY_SIZE,
                                       window_length=HISTORY_LEN)
        noise_kwargs = dict(size=nb_actions,
                            theta=OU_THETA,
                            mu=OU_MU,
                            sigma=OU_SIGMA)
        self.random_process = OrnsteinUhlenbeckProcess(**noise_kwargs)

        # Hyper-parameters.
        self.batch_size, self.tau = BATCH_SIZE, TAU
        self.discount = GAMMA
        self.depsilon = 1.0 / DEPSILON

        # Mutable run state.
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()
示例#16
0
class Preyer:
    """Single-agent actor-critic learner with its own replay list.

    Holds online/target actor and critic networks, their Adam optimisers,
    an Ornstein-Uhlenbeck noise process scaled by a decaying ``epsilon``,
    and a bounded list of ``(s, a, r, s_, done)`` transitions.
    """

    def __init__(self, s_dim, a_dim, **kwargs):
        """Build networks, optimisers, noise process and replay storage.

        Args:
            s_dim: first constructor argument of ``Actor`` and ``Critic``.
            a_dim: second constructor argument of ``Actor`` and ``Critic``;
                also the ``size`` of the noise process and the action width
                used by ``train`` and ``random_action``.
            **kwargs: must contain ``config`` (attributes read by this
                class: ``use_cuda``, ``a_lr``, ``c_lr``, ``ou_theta``,
                ``ou_mu``, ``ou_sigma``, ``epsilon_decay``,
                ``memory_length``, ``batch_size``, ``gamma``, ``tau``).

        Raises:
            KeyError: if ``config`` is not supplied.
        """
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

    def memory(self, s, a, r, s_, done):
        """Append one transition and evict the oldest once the cap is hit.

        The check uses ``>=`` after appending, so the list never holds more
        than ``config.memory_length - 1`` transitions between calls.
        """
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        """Sample ``config.batch_size`` transitions and split them by field.

        Returns:
            Five ``numpy`` arrays: states, actions, rewards, next states
            and done flags.

        Raises:
            ValueError: from ``random.sample`` if the buffer holds fewer
                than ``config.batch_size`` transitions.
        """
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([exp[0] for exp in experiences])
        action_batches = np.array([exp[1] for exp in experiences])
        reward_batches = np.array([exp[2] for exp in experiences])
        next_state_batches = np.array([exp[3] for exp in experiences])
        done_batches = np.array([exp[4] for exp in experiences])

        return (state_batches, action_batches, reward_batches,
                next_state_batches, done_batches)

    def choose_action(self, s, noisy=True):
        """Return the actor's action for ``s``, optionally with noise.

        Args:
            s: observation convertible to a float tensor.
            noisy: when true, add Ornstein-Uhlenbeck noise scaled by
                ``max(epsilon, 0.001)`` and decrease ``epsilon`` by
                ``depsilon``.

        Returns:
            The action clipped to [-1, 1], wrapped in a leading axis of
            length one.
        """
        s = torch.FloatTensor(s).to(self.device)
        a = self.actor(s).cpu().detach().numpy()

        if noisy:
            # In-place add keeps the array dtype produced by the network.
            a += max(self.epsilon, 0.001) * self.random_process.sample()
            self.epsilon -= self.depsilon
        a = np.clip(a, -1., 1.)

        return np.array([a])

    def random_action(self):
        """Return one uniform random action in [-1, 1) of width ``a_dim``."""
        return np.random.uniform(low=-1., high=1., size=(1, self.a_dim))

    def reset(self):
        """Reset the internal state of the exploration noise process."""
        self.random_process.reset_states()

    def train(self):
        """Run one critic update, one actor update and a soft target update.

        The critic is regressed with MSE onto
        ``r + not_done * gamma * Q_target(s', actor_target(s'))``; the actor
        minimises ``-Q(s, actor(s))``. The resulting loss values are kept
        for ``getLoss``.
        """
        (state_batches, action_batches, reward_batches, next_state_batches,
         done_batches) = self.get_batches()

        state_batches = torch.Tensor(state_batches).to(self.device)
        # One row per transition and ``a_dim`` columns. The previous
        # hard-coded single column mis-shaped the batch when a_dim > 1.
        action_batches = torch.Tensor(action_batches).reshape(
            -1, self.a_dim).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(-1, 1).to(
            self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        # 1.0 where the episode continues, 0.0 where it ended.
        not_done = np.logical_not(done_batches).astype(np.float32)
        done_batches = torch.Tensor(not_done).reshape(-1, 1).to(self.device)

        target_next_actions = self.actor_target(next_state_batches).detach()
        target_next_q = self.critic_target(next_state_batches,
                                           target_next_actions).detach()

        main_q = self.critic(state_batches, action_batches)

        # Critic Loss
        self.critic.zero_grad()
        baselines = (reward_batches +
                     done_batches * self.config.gamma * target_next_q)
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor Loss
        self.actor.zero_grad()
        clear_action_batches = self.actor(state_batches)
        loss_actor = (-self.critic(state_batches,
                                   clear_action_batches)).mean()
        loss_actor.backward()
        self.actor_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.actor, self.actor_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def getLoss(self):
        """Return ``(critic_loss, actor_loss)`` from the latest ``train``."""
        return self.c_loss, self.a_loss
示例#17
0
class Agent():
    """Actor-critic agent with target networks and OU exploration noise."""

    def __init__(self, nb_states, nb_actions):
        """Build networks, optimisers, loss and noise process.

        Args:
            nb_states: first constructor argument of ``Critic``/``Actor``.
            nb_actions: second constructor argument of ``Critic``/``Actor``;
                also the ``size`` of the noise process and the length of the
                random actions produced by ``act``.
        """
        # Kept on the instance because act() needs it for random actions.
        self.nb_actions = nb_actions

        self.critic = Critic(nb_states, nb_actions)  # Q
        self.critic_target = Critic(nb_states, nb_actions)
        self.actor = Actor(nb_states, nb_actions)  # policy mu
        self.actor_target = Actor(nb_states, nb_actions)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=0.001)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=0.0001)

        self.criterion = nn.MSELoss()

        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=0.15,
                                                       mu=0,
                                                       sigma=0.2)

        self.gamma = 0.99
        self.batch_size = 64

        if USE_CUDA:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

    def act(self, obs, epsilon=0.1):  # epsilon -> tuning parameter
        """Pick an action for ``obs``.

        With probability ``epsilon`` a uniform random action in [-1, 1) is
        returned; otherwise the actor output plus exploration noise, clipped
        to [-1, 1].

        Args:
            obs: observation passed to the actor network.
            epsilon: probability of taking a uniform random action.

        Returns:
            A ``numpy`` array holding the action.
        """
        if random.random() < epsilon:  # choose random action
            # The bare name ``nb_actions`` used here before is not defined in
            # this scope; the value stored by __init__ is used instead.
            return np.random.uniform(-1., 1., self.nb_actions)

        # the action is the output of actor network + Exploration Noise
        action = self.actor(obs).cpu().data.numpy()
        action += self.random_process.sample()
        return np.clip(action, -1., 1.)  # to stay in interval [-1,1]

    def backward(self, transitions):
        """Run one critic update, one actor update and soft target updates.

        Args:
            transitions: currently ignored -- a fresh batch is drawn from
                the module-level ``memory`` instead.
                NOTE(review): decide whether callers should supply the
                batch; behaviour is left as it was.
        """
        transitions = memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state)).type(FLOAT)
        action_batch = Variable(torch.cat(batch.action)).type(FLOAT)
        next_state_batch = Variable(torch.cat(batch.next_state)).type(FLOAT)
        reward_batch = Variable(torch.cat(batch.reward)).type(FLOAT)
        done_batch = Variable(torch.cat(batch.done)).type(FLOAT)

        #### Q - CRITIC UPDATE ####

        # Q(s_t,a_t)
        action_batch.unsqueeze_(1)
        state_action_value = self.critic(state_batch, action_batch)

        # a_{t+1} = mu_target(s_{t+1})
        next_action = self.actor_target(next_state_batch).detach()

        # Q'(s_{t+1},a_{t+1})
        next_state_action_value = self.critic_target(next_state_batch,
                                                     next_action).detach()

        # 1 where the transition is not terminal, 0 where it is. Computed
        # with tensor ops so it also works when the batch lives on the GPU
        # (the earlier NumPy round-trip cannot handle CUDA tensors).
        mask = (done_batch == 0).type(FLOAT)

        # Compare Q(s_t,a_t) with r_t + gamma * Q'(s_{t+1},a_{t+1}).
        # Everything is flattened to one value per sample so prediction and
        # target have the same shape. Comparing a column against a flat
        # vector would broadcast to a batch x batch matrix inside the loss.
        expected_state_action_value = reward_batch.view(-1) + (
            self.gamma * next_state_action_value.view(-1) * mask.view(-1))
        loss = self.criterion(state_action_value.view(-1),
                              expected_state_action_value)

        # Optimize the nn by updating weights with adam descent
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        #### mu - ACTOR UPDATE ####

        # a_t = mu(s_t)
        action = self.actor(state_batch)

        # J = E[Q(s_t, mu(s_t))] is to be maximised, so the loss that gets
        # minimised is -J.
        policy_loss = -self.critic(state_batch, action)
        policy_loss = policy_loss.mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        #### update target network with polyak averaging
        soft_update(self.critic_target, self.critic, tau=0.001)
        soft_update(self.actor_target, self.actor, tau=0.001)
        return
示例#18
0
class Agent(object):
    """Actor-critic agent with target networks and Ornstein-Uhlenbeck noise.

    The actor is called as ``action, _ = self.actor(state)`` and exposes
    ``reset_lstm_hidden_state``, so it is expected to be a recurrent network.
    """

    def __init__(self, nb_states, nb_actions, args):
        """Build the networks, the exploration noise and the hyper-parameters.

        Args:
            nb_states: state size forwarded to ``Actor`` / ``Critic``.
            nb_actions: action size forwarded to ``Actor`` / ``Critic`` and
                used as the size of the noise process.
            args: namespace providing ``seed``, ``init_w``, ``ou_theta``,
                ``ou_mu``, ``ou_sigma``, ``bsize``, ``trajectory_length``,
                ``tau``, ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions, args.init_w)
        self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w)

        self.critic = Critic(self.nb_states, self.nb_actions, args.init_w)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    args.init_w)

        # Make sure the targets start with the same weights as the online nets
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Exploration noise added to the actor output in select_action()
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.trajectory_length = args.trajectory_length
        self.tau = args.tau
        self.discount = args.discount
        # Amount subtracted from epsilon on every decaying select_action()
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def seed(self, seed):
        """Seed the torch random generators (CUDA too when it is enabled).

        ``__init__`` calls this for ``args.seed > 0``; without it that path
        raised AttributeError.
        """
        torch.manual_seed(seed)
        if USE_CUDA:
            torch.cuda.manual_seed(seed)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        """Return ``nb_actions`` values drawn uniformly from [-1, 1)."""
        return np.random.uniform(-1., 1., self.nb_actions)

    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        """Return the actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: a single observation; it is wrapped into a batch of one.
            noise_enable: add epsilon-scaled OU noise (only while
                ``is_training`` is truthy).
            decay_epsilon: subtract ``depsilon`` from ``epsilon`` afterwards.
        """
        action, _ = self.actor(to_tensor(np.array([state])))
        action = to_numpy(action).squeeze(0)
        if noise_enable:
            # is_training acts as a 0/1 switch; epsilon is floored at 0
            action += self.is_training * max(self.epsilon,
                                             0) * self.random_process.sample()

        action = np.clip(action, -1., 1.)
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset_lstm_hidden_state(self, done=True):
        """Forward the hidden-state reset request to the actor."""
        self.actor.reset_lstm_hidden_state(done)

    def reset(self):
        """Reset the exploration noise process."""
        self.random_process.reset_states()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def load_weights(self, output):
        """Load actor and critic weights from ``output``.

        Returns:
            False when ``output`` is None (nothing is loaded), True otherwise.
        """
        if output is None:
            return False

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))
        return True

    def save_model(self, output):
        """Write actor and critic weights into directory ``output``.

        Missing parent directories are created as well, so a nested path
        such as ``runs/exp1`` works on a fresh checkout.
        """
        if not os.path.exists(output):
            os.makedirs(output)

        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
    def __init__(self, env, config):
        """Assemble the commander/unit actor-critic networks and their tooling.

        Args:
            env: environment handle, stored unchanged on ``self.env``.
            config: settings object; this constructor reads EPSILON,
                MEMORY_SIZE, BATCH_SIZE, UNIT_BATCH_SIZE, STATE_DIM,
                COMMAND_DIM, ACTION_DIM, RNN_INSIZE, RNN_OUTSIZE, HIDDEN_SIZE,
                GPU, ACTOR_LR, CRITIC_LR, MYSELF_NUM and K.
        """
        self.name = 'HierarchicalNet'
        self.save_folder = None
        self.test_record = {}
        self.train_record = {}

        self.config = config
        self.env = env
        self.epsilon = config.EPSILON

        # Replay memories (the unit memory is twice as large)
        self.commander_memory = Commander_Memory(config.MEMORY_SIZE,
                                                 config.BATCH_SIZE)
        self.unit_memory = Unit_Memory(2 * config.MEMORY_SIZE,
                                       config.UNIT_BATCH_SIZE)

        # Commander level: online network first, then its target twin
        commander_actor_args = (config.STATE_DIM, config.COMMAND_DIM,
                                config.RNN_INSIZE)
        commander_critic_args = (config.STATE_DIM, config.COMMAND_DIM,
                                 config.BATCH_SIZE, config.RNN_INSIZE)
        self.commander_actor = Commander_Actor(*commander_actor_args)
        self.commander_actor_target = Commander_Actor(*commander_actor_args)
        self.commander_critic = Commander_Critic(*commander_critic_args)
        self.commander_critic_target = Commander_Critic(*commander_critic_args)

        # Unit level: same layout
        unit_actor_args = (config.STATE_DIM, config.COMMAND_DIM,
                           config.ACTION_DIM)
        unit_critic_args = unit_actor_args + (config.HIDDEN_SIZE,)
        self.unit_actor = Unit_Actor(*unit_actor_args)
        self.unit_actor_target = Unit_Actor(*unit_actor_args)
        self.unit_critic = Unit_Critic(*unit_critic_args)
        self.unit_critic_target = Unit_Critic(*unit_critic_args)

        # Initial hidden state handed to the commander actor
        self.commander_actor_h0 = Variable(
            torch.zeros(2, 1, config.RNN_OUTSIZE), requires_grad=False)

        if config.GPU >= 0:
            gpu = config.GPU
            for network in (self.commander_actor,
                            self.commander_actor_target,
                            self.commander_critic,
                            self.commander_critic_target,
                            self.unit_actor,
                            self.unit_actor_target,
                            self.unit_critic,
                            self.unit_critic_target):
                network.cuda(device=gpu)
            # The hidden states are plain tensors, so they are moved by hand
            self.commander_critic.h0 = self.commander_critic.h0.cuda(device=gpu)
            self.commander_critic_target.h0 = \
                self.commander_critic_target.h0.cuda(device=gpu)
            self.commander_actor_h0 = self.commander_actor_h0.cuda(device=gpu)

        # Pair every online network with its target
        for online, target in ((self.commander_actor, self.commander_actor_target),
                               (self.commander_critic, self.commander_critic_target),
                               (self.unit_actor, self.unit_actor_target),
                               (self.unit_critic, self.unit_critic_target)):
            copy_parameter(online, target)

        self.commander_actor_optimizer = optim.Adam(
            self.commander_actor.parameters(), lr=config.ACTOR_LR)
        self.unit_actor_optimizer = optim.Adam(
            self.unit_actor.parameters(), lr=config.ACTOR_LR)
        self.commander_critic_optimizer = optim.Adam(
            self.commander_critic.parameters(), lr=config.CRITIC_LR)
        self.unit_critic_optimizer = optim.Adam(
            self.unit_critic.parameters(), lr=config.CRITIC_LR)

        self.criterion = nn.MSELoss()

        # Exploration noise for unit actions and for commands
        # (theta=30, sigma=3 was an alternative setting tried earlier)
        self.action_noise = OrnsteinUhlenbeckProcess(
            size=(config.MYSELF_NUM, config.ACTION_DIM),
            theta=10, mu=0., sigma=2)
        self.command_noise = OrnsteinUhlenbeckProcess(
            size=(1, config.MYSELF_NUM, config.COMMAND_DIM),
            theta=10, mu=0., sigma=2)

        # Normalisation vector: the own-unit factors followed by K copies of
        # the enemy factors
        own_scale = [1, 100, 100, 1, 100, 100, 1]
        enemy_scale = [1, 100, 100, 100, 100, 10, 100, 100, 1, 1, 1, 10]
        scales = list(own_scale)
        for _ in range(config.K):
            scales.extend(enemy_scale)
        self.state_normalization = np.asarray(scales, dtype=np.float32)
示例#20
0
    def __init__(
        self,
        env,
        mem_size=int(1e6),
        lr_critic=1e-3,
        lr_actor=1e-4,
        max_epi=int(1e4),
        epsilon_decay=1. / (1e5),
        gamma=.99,
        target_update_frequency=200,
        batch_size=64,
        random_process_mode='default',
        max_step=None,
        actor_update_mode='default',
        popart=False,
        actor='standard',
        critic='43',
        epsilon_start=1.,
        epsilon_end=.01,
        epsilon_rate=1. / 200,
        partition_num=100,
        env_log_freq=100,
        model_log_freq=500,
        target_update_mode='hard',
        tau=1e-3,
        grad_clip_mode=None,
        grad_clip_norm=5.,
        critic_weight_decay=0.,
        exp_trunc=[],
        exp_percent=[],
        exp_rebalance_freq=None,
        exp_type='rank',
    ):
        """Configure networks, replay memories, exploration noise and logging.

        Args:
            env: environment; must expose ``class_name()``, ``unwrapped``,
                ``observation_space`` and ``action_space``.
            mem_size: capacity handed to every replay memory.
            lr_critic, lr_actor: Adam learning rates (see the note at the
                actor optimizer about the sign).
            max_epi: stored as ``MAX_EPI``; also scales the memories'
                ``total_step`` setting.
            batch_size: total batch size, split across the replay memories.
            max_step: if given and ``env`` is a ``gym.Wrapper``, overrides
                ``_max_episode_steps`` of the wrapped ``TimeLimit``.
            actor, critic: keys into ``ActorRegistry`` / ``CriticRegistry``.
            exp_trunc, exp_percent: equally long lists; each entry adds one
                extra replay memory receiving ``batch_size * exp_percent[i]``
                samples, with the remainder going to a final memory.
            exp_type: ``'rank'`` selects ``Experience``, anything else
                ``PrioritizedReplayBuffer``.
            Remaining arguments are stored on the instance unchanged.

        Raises:
            RuntimeError: if ``exp_trunc`` and ``exp_percent`` differ in length.
        """
        # configuration log: snapshot every argument as "name: value"
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        self.config = ['{}: {}'.format(arg, values[arg]) for arg in args]

        self.CUDA = torch.cuda.is_available()
        self.ENV_NORMALIZED = env.class_name() == 'NormalizedEnv'
        self.POPART = popart
        self.actor_update_mode = actor_update_mode

        self.orig_env = env  # for recording
        if max_step is not None:
            tmp_env = env
            if isinstance(tmp_env, gym.Wrapper):
                # Walk down the wrapper chain to the TimeLimit layer.
                # NOTE(review): assumes a TimeLimit wrapper exists; otherwise
                # this fails once the chain runs out of `.env` attributes.
                while tmp_env.class_name() != 'TimeLimit':
                    tmp_env = tmp_env.env
                tmp_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N = 1
        if hasattr(self.env.unwrapped, 'N'):
            self.N = self.env.unwrapped.N
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        # Floor division keeps these sizes integral on Python 3 as well
        # (identical to `/` on Python 2 ints).
        self.n_s = self.N_S // self.N
        self.n_a = self.N_A // self.N
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        # Actors are built with the per-N sizes, critics with the full sizes
        self.actor = ActorRegistry[actor](self.n_s, self.n_a)
        self.critic = CriticRegistry[critic](self.N_S, self.N_A)
        self.target_actor = ActorRegistry[actor](self.n_s, self.n_a)
        self.target_critic = CriticRegistry[critic](self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        # pop-art
        self.update_counter = 0
        self.beta = .1
        self.y_mean = 0.
        self.y_square_mean = 0.
        self.target_y_mean = self.y_mean
        self.target_y_square_mean = self.y_square_mean

        # per
        self.total_step = 0
        self.PARTITION_NUM = partition_num
        self.LEARN_START = mem_size // self.PARTITION_NUM + 1
        # Copy so the shared default lists (or a caller's list) are never
        # aliased by instance state.
        self.exp_trunc = list(exp_trunc)
        self.exp_percent = list(exp_percent)
        self.exp_rebalance_freq = exp_rebalance_freq
        self.exp_batch_sizes = []
        self.exp_type = exp_type
        if len(self.exp_trunc) != len(self.exp_percent):
            raise RuntimeError("different exp_trunc and exp_percent length")

        # One share per configured percentage, then the remainder
        shares = [int(batch_size * percent) for percent in self.exp_percent]
        shares.append(int(batch_size * (1 - sum(self.exp_percent))))

        self.exp = []
        for share in shares:
            self.exp_batch_sizes.append(share)
            exp_conf = {
                'size': mem_size,
                'learn_start': self.LEARN_START,
                'partition_num': self.PARTITION_NUM,
                'total_step': self.MAX_EPI * 50,
                'batch_size': share
            }
            if self.exp_type == 'rank':
                self.exp.append(Experience(exp_conf))
            else:
                self.exp.append(PrioritizedReplayBuffer(mem_size, alpha=.7))

        self.optim_critic = optim.Adam(self.critic.parameters(),
                                       lr=lr_critic,
                                       weight_decay=critic_weight_decay)
        # NOTE(review): the learning rate is negated here, presumably so the
        # optimizer ascends the actor objective. Recent PyTorch releases
        # reject negative learning rates - confirm against the update step
        # before changing the sign.
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=-lr_actor)

        # One independent noise process per N
        self.random_processes = []
        for _ in range(self.N):
            random_process = OrnsteinUhlenbeckProcess(
                size=self.n_a, theta=.15, mu=0, sigma=.2)
            self.random_processes.append(random_process)
        self.EPSILON_START = epsilon_start
        self.EPSILON_END = epsilon_end
        # only default random process mode will use epsilon decay
        self.EPSILON_DECAY = epsilon_decay
        # other random process mode will use epsilon rate
        self.EPSILON_RATE = epsilon_rate
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size
        self.target_update_mode = target_update_mode
        self.tau = tau

        self.RAND_PROC = random_process_mode

        self.grad_clip_mode = grad_clip_mode
        self.grad_clip_norm = grad_clip_norm

        # logger
        self.logger = None
        self.env_log_freq = env_log_freq
        self.model_log_freq = model_log_freq
        self.step = 0

        # random seed: derived from the wall clock, so runs are not repeatable
        self.seed = int(time.time())
        random.seed(self.seed)
        np.random.seed(self.seed)
示例#21
0
def run_agent(model_params, weights, state_transform, data_queue, weights_queue,
              process, global_step, updates, best_reward, param_noise_prob, save_dir,
              max_steps=10000000):
    """Worker loop: play episodes, ship the trajectories, fetch fresh weights.

    For every episode the collected states/actions/rewards/terminals are put
    on ``data_queue`` together with ``process`` and the episode reward (plus
    the actor weights when the episode beat ``best_reward.value``). The worker
    then blocks on ``weights_queue`` for new actor weights, appends a report
    line to ``<save_dir>/train_report.log`` and picks the exploration mode of
    the next episode: action noise with probability ``1 - param_noise_prob``,
    parameter noise otherwise. The loop ends once ``global_step.value``
    reaches ``max_steps``.
    """
    _, actor_fn, _, params_actor, params_crit, _, _ = build_model(**model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    def _make_env():
        # A fresh environment; also used for the periodic rebuild below
        return RunEnv2(state_transform, max_obstacles=config.num_obstacles,
                       skip_frame=config.skip_frames)

    env = _make_env()
    random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.2, size=env.noutput,
                                              sigma_min=0.05, n_steps_annealing=1e6)

    report_template = ('Global step: {}, steps/sec: {:.2f}, updates: {}, episode len {}, '
                       'reward: {:.2f}, original_reward {:.4f}; best reward: {:.2f} noise {}')
    log_path = os.path.join(save_dir, 'train_report.log')

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        # per-episode trajectory buffers
        states, actions, rewards, terminals = [], [], [], []

        seed = random.randrange(2**32-2)
        state = env.reset(seed=seed, difficulty=2)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        steps = 0
        terminal = False

        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env.step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # the terminal flag recorded here belongs to `state`, not to
            # `next_state`
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state, terminal = next_state, next_terminal

        total_episodes += 1

        # close the trajectory with the final state and a zero action/reward
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )

        # weights travel with the data only for a new best episode
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        else:
            weight_send = None
        data_queue.put((process, data, weight_send, total_reward))

        # blocks until new weights arrive
        weights = weights_queue.get()

        noise_kind = 'actions' if action_noise else 'params'
        report_str = report_template.format(
            global_step.value, 1. * global_step.value / (time() - start), updates.value, steps,
            total_reward, total_reward_original, best_reward.value, noise_kind)
        print(report_str)

        with open(log_path, 'a') as f:
            f.write(report_str + '\n')

        actor.set_actor_weights(weights)
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # the environment is rebuilt every 100 episodes
        if total_episodes % 100 == 0:
            env = _make_env()
示例#22
0
class RDPG_v2:
    """Recurrent deterministic policy-gradient agent with target networks.

    Actions are kept in [0, 1]: ``random_action`` samples from that range and
    ``select_action`` clips to it.
    """

    # Width of the zero-initialised LSTM state used in update_policy().
    # NOTE(review): presumably has to match the hidden size of Actor_RDPG.
    _HIDDEN_SIZE = 50

    def __init__(self, conf, device):
        """Create the networks, optimizers and exploration noise.

        Args:
            conf: mapping with at least ``'state_dim'``, ``'action_dim'`` and
                ``'batch_size'``.
            device: torch device (or device string) that hosts the networks.
        """
        self.conf = conf
        self.state_dim = conf['state_dim']
        self.action_dim = conf['action_dim']
        self.device = device

        # create actor and critic network
        self.actor = Actor_RDPG(self.state_dim,
                                self.action_dim).to(self.device)
        self.actor_target = Actor_RDPG(self.state_dim,
                                       self.action_dim).to(self.device)

        self.critic = Critic_RDPG(self.state_dim,
                                  self.action_dim).to(self.device)
        self.critic_target = Critic_RDPG(self.state_dim,
                                         self.action_dim).to(self.device)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # q_lr / policy_lr are module-level settings
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=q_lr)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=policy_lr)

        # Exploration noise (theta=0.15, mu=0.0, sigma=0.2)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.action_dim,
                                                       theta=0.15,
                                                       mu=0.0,
                                                       sigma=0.2)

        self.epsilon = 1.0
        self.depsilon = 1.0 / 50000
        self.is_training = True
        self.tau = 0.001  # moving average for target network

    def random_action(self):
        """Return ``action_dim`` values drawn uniformly from [0, 1)."""
        return np.random.uniform(0., 1., self.action_dim)

    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        """Return the actor's action for ``state``, clipped to [0, 1].

        Args:
            state: a single observation; it is flattened and given a batch
                dimension of one before being fed to the actor.
            noise_enable: add epsilon-scaled OU noise (only while
                ``is_training`` is truthy).
            decay_epsilon: subtract ``depsilon`` from ``epsilon`` afterwards.
        """
        # the actor returns (action, recurrent_state)
        action, _ = self.actor(to_tensor(state).reshape(-1).unsqueeze(0))
        action = action.cpu().detach().numpy().squeeze(0)  # [action_dim]
        if noise_enable:
            action += self.is_training * max(self.epsilon,
                                             0) * self.random_process.sample()
        # values outside [0, 1] are replaced by the nearest bound
        action = np.clip(action, 0., 1.)
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def update_policy(self, memory, gamma=0.99):
        """Run one pass of critic/actor updates over a sampled batch.

        Args:
            memory: replay memory; ``memory.sample(batch_size)`` must return a
                time-indexed sequence whose entries are batches of
                transitions with ``state0``, ``action`` and ``reward``. An
                empty result means "not enough samples" and is a no-op.
            gamma: discount factor of the TD target.
        """
        print("updating...")
        batch_size = self.conf['batch_size']
        experiences = memory.sample(batch_size)
        if len(experiences) == 0:  # not enough samples
            return
        horizon = len(experiences)

        for t in range(horizon - 1):  # iterate over time steps
            # NOTE(review): the recurrent state is re-zeroed at every step,
            # so nothing is carried across time within this update - confirm
            # that this is intended.
            # Allocated on self.device so the update also runs without CUDA.
            target_hx = torch.zeros(batch_size, self._HIDDEN_SIZE,
                                    device=self.device)
            target_cx = torch.zeros(batch_size, self._HIDDEN_SIZE,
                                    device=self.device)
            hx = torch.zeros(batch_size, self._HIDDEN_SIZE, device=self.device)
            cx = torch.zeros(batch_size, self._HIDDEN_SIZE, device=self.device)

            # Slice step t out of every sampled trajectory
            state0 = np.stack(
                [trajectory.state0 for trajectory in experiences[t]])
            action = np.stack(
                [trajectory.action for trajectory in experiences[t]])
            reward = np.expand_dims(np.stack(
                [trajectory.reward for trajectory in experiences[t]]),
                                    axis=1)
            # the next state is the state stored at step t + 1
            state1 = np.stack(
                [trajectory.state0 for trajectory in experiences[t + 1]])

            state0_t = to_tensor(state0).reshape(batch_size, -1)
            state1_t = to_tensor(state1).reshape(batch_size, -1)

            # TD target: the target networks are never optimised, so no graph
            # is needed for them.
            with torch.no_grad():
                target_action, _ = self.actor_target(state1_t,
                                                     (target_hx, target_cx))
                next_q_value = self.critic_target([state1_t, target_action])
                target_q = to_tensor(reward) + gamma * next_q_value

            # Critic update
            current_q = self.critic([state0_t, to_tensor(action)])
            value_loss = F.smooth_l1_loss(current_q, target_q)
            value_loss = value_loss / horizon  # divide by trajectory length
            self.critic.zero_grad()
            value_loss.backward()
            # Step now: the policy loss below also back-propagates through
            # the critic, and those gradients must not leak into its update.
            self.critic_optim.step()

            # Actor update
            policy_action, _ = self.actor(state0_t, (hx, cx))
            policy_loss = -self.critic([state0_t, policy_action])
            policy_loss = (policy_loss / horizon).mean()
            self.actor.zero_grad()
            policy_loss.backward()
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        print("update finish!")

    def reset_lstm_hidden_state(self, done=True):
        """Forward the hidden-state reset request to the actor."""
        self.actor.reset_lstm_hidden_state(done)

    def save_model(self, path):
        """Save critic, target critic and actor to ``path`` + suffix."""
        torch.save(self.critic.state_dict(), path + '_q')
        torch.save(self.critic_target.state_dict(), path + '_target_q')
        torch.save(self.actor.state_dict(), path + '_policy')

    def load_model(self, path):
        """Load the files written by ``save_model`` and switch to eval mode.

        Tensors are mapped onto ``self.device``, so a checkpoint saved on a
        GPU can be restored on a CPU-only machine.
        """
        self.critic.load_state_dict(
            torch.load(path + '_q', map_location=self.device))
        self.critic_target.load_state_dict(
            torch.load(path + '_target_q', map_location=self.device))
        self.actor.load_state_dict(
            torch.load(path + '_policy', map_location=self.device))
        # No target-actor file is saved; align it with the restored actor
        # instead of leaving it at its random initialisation.
        hard_update(self.actor_target, self.actor)
        self.critic.eval()
        self.critic_target.eval()
        self.actor.eval()
示例#23
0
class DDPG(object):
    """DDPG agent skeleton with optional single- or multi-GPU placement.

    ``update_policy`` is a stub here; the class provides network setup,
    device handling, action selection and checkpointing.
    """

    def __init__(self, args, nb_states, nb_actions):
        """Build networks, optimizers, replay memory and exploration noise.

        Args:
            args: namespace providing ``seed``, ``gpu_nums``, ``hidden1``,
                ``hidden2``, ``init_w``, ``p_lr``, ``c_lr``, ``weight_decay``,
                ``rmsize``, ``window_length``, ``ou_theta``, ``ou_mu``,
                ``ou_sigma``, ``bsize``, ``tau_update``, ``gamma`` and
                ``epsilon``.
            nb_states: state size forwarded to ``Actor`` / ``Critic``.
            nb_actions: action size forwarded to ``Actor`` / ``Critic``.
        """
        USE_CUDA = torch.cuda.is_available()

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        # [-1] stands for "CPU only"
        self.gpu_ids = [i for i in range(args.gpu_nums)] if USE_CUDA and args.gpu_nums > 0 else [-1]
        self.gpu_used = self.gpu_ids[0] >= 0

        # Seed only after the GPU attributes exist (seed() reads them) and
        # before any network is built, so weight init is reproducible.
        if args.seed > 0:
            self.seed(args.seed)

        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg).double()
        self.actor_optim = Adam(self.actor.parameters(), lr=args.p_lr, weight_decay=args.weight_decay)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg).double()
        self.critic_optim = Adam(self.critic.parameters(), lr=args.c_lr, weight_decay=args.weight_decay)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Replay buffer and exploration noise
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau_update = args.tau_update
        self.gamma = args.gamma

        # Linear decay rate of exploration policy
        self.depsilon = 1.0 / args.epsilon
        # initial exploration rate
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        self.continious_action_space = False

    def update_policy(self):
        """Placeholder; no update is performed."""
        pass

    def cuda_convert(self):
        """Move the networks to the configured GPU(s).

        One GPU id >= 0: plain ``cuda()`` inside that device's context.
        Several ids: wrap the networks in ``DataParallel`` first.
        """
        if len(self.gpu_ids) == 1:
            if self.gpu_ids[0] >= 0:
                with torch.cuda.device(self.gpu_ids[0]):
                    print('model cuda converted')
                    self.cuda()
        if len(self.gpu_ids) > 1:
            self.data_parallel()
            self.cuda()
            self.to_device()
            print('model cuda converted and paralleled')

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the current CUDA device."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def data_parallel(self):
        """Wrap all four networks in ``nn.DataParallel`` over ``gpu_ids``."""
        self.actor = nn.DataParallel(self.actor, device_ids=self.gpu_ids)
        self.actor_target = nn.DataParallel(self.actor_target, device_ids=self.gpu_ids)
        self.critic = nn.DataParallel(self.critic, device_ids=self.gpu_ids)
        self.critic_target = nn.DataParallel(self.critic_target, device_ids=self.gpu_ids)

    def to_device(self):
        """Place all four networks on the first configured GPU."""
        device = torch.device('cuda:{}'.format(self.gpu_ids[0]))
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def observe(self, r_t, s_t1, done):
        """Store the latest transition and advance the current state.

        Does nothing unless ``is_training`` is set.
        """
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Return ``nb_actions`` values drawn uniformly from [-1, 1)."""
        return np.random.uniform(-1., 1., self.nb_actions)

    def select_action(self, s_t, decay_epsilon=True):
        """Return the noisy actor output for ``s_t``, clipped to [-1, 1].

        Args:
            s_t: a single observation; it is wrapped into a batch of one.
            decay_epsilon: subtract ``depsilon`` from ``epsilon`` afterwards.
        """
        # proto action
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t]), gpu_used=self.gpu_used, gpu_0=self.gpu_ids[0])),
            gpu_used=self.gpu_used
        ).squeeze(0)
        # is_training acts as a 0/1 switch; epsilon is floored at 0
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset(self, s_t):
        """Start a new episode from ``s_t`` and reset the noise process."""
        self.s_t = s_t
        self.random_process.reset_states()

    def load_weights(self, dir):
        """Load actor and critic weights from ``output/<dir>/``.

        A ``None`` argument is a no-op.
        """
        if dir is None:
            return

        if self.gpu_used:
            # load all tensors onto the first configured GPU; storage.cuda()
            # takes a single device index, not the whole id list
            ml = lambda storage, loc: storage.cuda(self.gpu_ids[0])
        else:
            # load all tensors to CPU
            ml = lambda storage, loc: storage

        # NOTE(review): these are ``*.pkl`` files under ``output/<dir>``,
        # while save_model() writes ``*.pt`` into ``<output>`` - confirm
        # which naming the checkpoints on disk really use.
        self.actor.load_state_dict(
            torch.load('output/{}/actor.pkl'.format(dir), map_location=ml)
        )

        self.critic.load_state_dict(
            torch.load('output/{}/critic.pkl'.format(dir), map_location=ml)
        )
        print('model weights loaded')

    def save_model(self, output):
        """Write ``actor.pt`` and ``critic.pt`` into directory ``output``.

        With several GPU ids the networks are ``DataParallel`` wrappers, so
        the state dict is taken from the wrapped ``.module``.
        """
        multi_gpu = len(self.gpu_ids) > 1
        actor = self.actor.module if multi_gpu else self.actor
        critic = self.critic.module if multi_gpu else self.critic

        def _write():
            torch.save(actor.state_dict(), '{}/actor.pt'.format(output))
            torch.save(critic.state_dict(), '{}/critic.pt'.format(output))

        if len(self.gpu_ids) == 1 and self.gpu_ids[0] >= 0:
            with torch.cuda.device(self.gpu_ids[0]):
                _write()
        else:
            _write()

    def seed(self, seed):
        """Seed the torch random generators (all GPUs when one is in use)."""
        torch.manual_seed(seed)
        if self.gpu_used:
            torch.cuda.manual_seed_all(seed)
示例#24
0
class BiCNet():
    """Multi-agent actor-critic learner with target networks and a list-based
    replay buffer."""

    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        """Build the networks, optimizers, noise process and bookkeeping.

        Args:
            s_dim: state size forwarded to ``Actor`` / ``Critic``.
            a_dim: action size forwarded to ``Actor`` / ``Critic``.
            n_agents: number of agents, forwarded to the networks.
            **kwargs: must contain ``config`` with ``use_cuda``, ``a_lr``,
                ``c_lr``, ``ou_theta``, ``ou_mu``, ``ou_sigma``,
                ``epsilon_decay``, ``memory_length``, ``batch_size``,
                ``gamma`` and ``tau``.
        """
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        # NOTE(review): the online network is passed first here and in
        # train(); verify that this matches the helpers' parameter order.
        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        """Return the policy's action for ``obs``, clipped to [-1, 1].

        Args:
            obs: one joint observation; it is wrapped into a batch of one.
            noisy: when true, the epsilon schedule advances (floor 0.001).
                No noise is added to the action at present.

        The raw (unclipped) policy output is appended to ``action_log``.
        """
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            # Exploration noise is currently switched off; only the epsilon
            # schedule keeps running.
            self.epsilon = max(self.epsilon - self.depsilon, 0.001)

        # np.clip returns a new array, so its result has to be the one
        # handed back to the caller.
        return np.clip(action, -1., 1.)

    def reset(self):
        """Reset the noise process and forget the logged actions."""
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        """Switch all four networks to training mode."""
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        """Switch all four networks to evaluation mode."""
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        """Return a uniform random action in [-1, 1) of shape (n_agents, 2).

        NOTE(review): the action width is fixed at 2 rather than ``a_dim`` -
        confirm that this is intended.
        """
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        """Append a transition and drop the oldest one once the buffer has
        reached ``config.memory_length`` entries."""
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        """Sample ``config.batch_size`` transitions without replacement.

        Returns:
            Arrays of states, actions, rewards, next states and done flags,
            in that order.
        """
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([exp[0] for exp in experiences])
        action_batches = np.array([exp[1] for exp in experiences])
        reward_batches = np.array([exp[2] for exp in experiences])
        next_state_batches = np.array([exp[3] for exp in experiences])
        done_batches = np.array([exp[4] for exp in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):
        """Run one critic step, one actor step and the soft target updates.

        The two losses are stored for ``get_loss``.
        """
        states, actions, rewards, next_states, dones = self.get_batches()

        per_agent = (self.config.batch_size, self.n_agents, 1)
        states = torch.Tensor(states).to(self.device)
        actions = torch.Tensor(actions).to(self.device)
        rewards = torch.Tensor(rewards).reshape(per_agent).to(self.device)
        next_states = torch.Tensor(next_states).to(self.device)
        # 1 where the episode continues, 0 where it ended
        not_done = torch.Tensor(
            (dones == False) * 1).reshape(per_agent).to(self.device)

        # TD target; the target networks are never optimised, so no graph is
        # needed for this part.
        with torch.no_grad():
            target_next_actions = self.policy_target(next_states)
            target_next_q = self.critic_target(next_states,
                                               target_next_actions)
            baselines = rewards + not_done * self.config.gamma * target_next_q

        main_q = self.critic(states, actions)

        # Critic Loss
        self.critic.zero_grad()
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss: maximise Q, with a small penalty on action magnitude
        self.policy.zero_grad()
        clear_action_batches = self.policy(states)
        loss_actor = -self.critic(states, clear_action_batches).mean()
        loss_actor += (clear_action_batches**2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        """Return ``(critic_loss, actor_loss)`` from the latest ``train``."""
        return self.c_loss, self.a_loss

    def get_action_std(self):
        """Return the mean, over logged actions, of the standard deviation
        along the last axis."""
        return np.array(self.action_log).std(axis=-1).mean()
示例#25
0
class DDPG(object):
    """DDPG agent: actor/critic networks with target copies, a replay memory
    and an Ornstein-Uhlenbeck exploration-noise process."""

    def __init__(self, nb_states, nb_actions, args):
        """Build networks, optimizers, replay memory and the noise process.

        Args:
            nb_states: forwarded to ``Actor``/``Critic`` as their first argument.
            nb_actions: forwarded to ``Actor``/``Critic``; also the size of the
                noise process and of uniformly sampled random actions.
            args: object providing ``seed``, ``hidden1``, ``hidden2``,
                ``init_w``, ``prate``, ``rate``, ``rmsize``, ``window_length``,
                ``ou_theta``, ``ou_mu``, ``ou_sigma``, ``bsize``, ``tau``,
                ``discount`` and ``epsilon``.
        """
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        net_cfg = {
            "hidden1": args.hidden1,
            "hidden2": args.hidden2,
            "init_w": args.init_w,
        }
        self.actor = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        # Make sure each target starts with the same weights as its network.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(
            limit=args.rmsize, window_length=args.window_length
        )
        self.random_process = OrnsteinUhlenbeckProcess(
            size=nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma
        )

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        # Amount subtracted from epsilon on every decaying select_action call.
        self.depsilon = 1.0 / args.epsilon

        # Mutable agent state
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def update_policy(self):
        """Run one critic step, one actor step and a soft target update on a
        batch sampled from the replay memory."""
        # Sample batch
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.memory.sample_and_split(self.batch_size)

        # Prepare for the target q batch. The target networks are evaluated
        # under no_grad (the replacement for the removed ``volatile=True``
        # flag) so the regression target carries no autograd history.
        with torch.no_grad():
            next_state = to_tensor(next_state_batch)
            next_q_values = self.critic_target(
                [next_state, self.actor_target(next_state)]
            )

        # ``np.float`` was an alias of the builtin ``float`` and has been
        # removed from NumPy; ``astype(float)`` yields the same dtype.
        # NOTE(review): terminal_batch scales the bootstrap term, so it is
        # presumably a "not terminal" mask -- confirm against the memory class.
        target_q_batch = (
            to_tensor(reward_batch)
            + self.discount * to_tensor(terminal_batch.astype(float)) * next_q_values
        )

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's estimate of the actor's actions.
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch), self.actor(to_tensor(state_batch))]
        )

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        """Switch all four networks to evaluation mode."""
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        """Store ``(s_t, a_t, r_t, done)`` in memory and advance ``s_t`` to
        ``s_t1``; does nothing when ``is_training`` is false."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def random_action(self):
        """Sample an action uniformly from [-1, 1) per dimension, remember it
        as ``a_t`` and return it."""
        action = np.random.uniform(-1.0, 1.0, self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True):
        """Return the actor's action for ``s_t`` with epsilon-scaled noise.

        Noise is only added while ``is_training`` is true; the result is
        clipped to [-1, 1] and remembered as ``a_t``. When ``decay_epsilon``
        is true, epsilon is reduced by ``depsilon`` after the action is built.
        """
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        # max(..., 0) keeps the noise scale from going negative once epsilon
        # has decayed below zero.
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1.0, 1.0)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        """Start a new episode: set the current state and reset the noise."""
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        """Load actor and critic weights from directory ``output``.

        A ``None`` directory is ignored. Only the online networks are
        loaded; the target networks are left untouched.
        """
        if output is None:
            return

        self.actor.load_state_dict(torch.load("{}/actor.pkl".format(output)))

        self.critic.load_state_dict(torch.load("{}/critic.pkl".format(output)))

    def save_model(self, output):
        """Save actor and critic state dicts into directory ``output``."""
        torch.save(self.actor.state_dict(), "{}/actor.pkl".format(output))
        torch.save(self.critic.state_dict(), "{}/critic.pkl".format(output))

    def seed(self, s):
        """Seed torch's RNG (and the CUDA RNG when ``USE_CUDA`` is set)."""
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
示例#26
0
def run_agent(args,
              model_params,
              weights,
              data_queue,
              weights_queue,
              process,
              global_step,
              updates,
              best_reward,
              param_noise_prob,
              save_dir,
              max_steps=10000000):
    """Roll out episodes and exchange data/weights with a trainer via queues.

    Each episode is collected with either action-space OU noise or parameter
    noise, pushed onto ``data_queue`` as ``(process, data, weight_send,
    total_reward)``, after which fresh weights are read from
    ``weights_queue``. The loop runs until ``global_step.value`` reaches
    ``max_steps``.

    Args:
        args: provides ``modeldim``, ``prosthetic`` and ``difficulty``.
        model_params: keyword arguments forwarded to ``build_model``.
        weights: initial actor weights.
        data_queue: queue receiving one tuple per finished episode.
        weights_queue: queue from which updated weights are read.
        process: identifier sent along with every data tuple.
        global_step, updates, best_reward: objects exposing ``.value``
            (presumably shared multiprocessing values -- confirm with caller).
        param_noise_prob: base probability of using parameter noise; the
            actual value is re-drawn once, uniformly, in
            ``[0.25 * p, min(1.5 * p, 1)]``.
        save_dir: directory of the ``train_report.log`` file appended to
            after each episode.
        max_steps: global step count at which the worker stops.
    """
    train_fn, actor_fn, target_update_fn, params_actor, params_crit, actor_lr, critic_lr = build_model(
        **model_params)
    actor = Agent(actor_fn, params_actor, params_crit)
    actor.set_actor_weights(weights)

    env = RunEnv2(model=args.modeldim,
                  prosthetic=args.prosthetic,
                  difficulty=args.difficulty,
                  skip_frame=config.skip_frames)
    env.spec.timestep_limit = 3000  # ndrw

    # Every worker draws its own noise settings.
    sigma_rand = random.uniform(0.05, 0.5)
    dt_rand = random.uniform(0.002, 0.02)
    param_noise_prob = random.uniform(param_noise_prob * 0.25,
                                      min(param_noise_prob * 1.5, 1.))

    random_process = OrnsteinUhlenbeckProcess(theta=.1,
                                              mu=0.,
                                              sigma=sigma_rand,
                                              dt=dt_rand,
                                              size=env.noutput,
                                              sigma_min=0.05,
                                              n_steps_annealing=1e6)

    print('OUProcess_sigma = ' + str(sigma_rand) + '    OUProcess_dt = ' +
          str(dt_rand) + '    param_noise_prob = ' + str(param_noise_prob))

    # prepare buffers for data
    states = []
    actions = []
    rewards = []
    terminals = []

    total_episodes = 0
    start = time()
    action_noise = True
    while global_step.value < max_steps:
        seed = random.randrange(2**32 - 2)
        state = env.reset(seed=seed, difficulty=args.difficulty)
        random_process.reset_states()

        total_reward = 0.
        total_reward_original = 0.
        terminal = False
        steps = 0

        # The loop condition alone ends the episode; ``terminal`` always
        # describes ``state``, i.e. the state about to be acted on.
        while not terminal:
            state = np.asarray(state, dtype='float32')
            action = actor.act(state)
            if action_noise:
                action += random_process.sample()

            next_state, reward, next_terminal, info = env._step(action)
            total_reward += reward
            total_reward_original += info['original_reward']
            steps += 1
            global_step.value += 1

            # add data to buffers; the stored flag belongs to ``state``,
            # not to ``next_state``
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            terminals.append(terminal)

            state = next_state
            terminal = next_terminal

        total_episodes += 1

        # add the final state with a zero action and zero reward
        states.append(state)
        actions.append(np.zeros(env.noutput))
        rewards.append(0)
        terminals.append(terminal)

        states_np = np.asarray(states).astype(np.float32)
        data = (
            states_np,
            np.asarray(actions).astype(np.float32),
            np.asarray(rewards).astype(np.float32),
            np.asarray(terminals),
        )
        # Only ship the actor weights when this episode beat the best reward.
        weight_send = None
        if total_reward > best_reward.value:
            weight_send = actor.get_actor_weights()
        # send data for training
        data_queue.put((process, data, weight_send, total_reward))

        # receive weights and set params to weights
        weights = weights_queue.get()

        report_str = (
            'Global step: {}, steps/sec: {:.2f}, updates: {}, episode len: {}, '
            'pelvis_X: {:.2f}, pelvis_Z: {:.2f}, reward: {:.2f}, '
            'best reward: {:.2f}, noise: {}'
        ).format(global_step.value,
                 1. * global_step.value / (time() - start), updates.value,
                 steps, info['pelvis'][0], info['pelvis'][2], total_reward,
                 best_reward.value, 'actions' if action_noise else 'params')
        print(report_str)

        # Writing the report is best-effort and must not stop the rollout,
        # but it must not swallow KeyboardInterrupt/SystemExit either.
        try:
            with open(os.path.join(save_dir, 'train_report.log'), 'a') as f:
                f.write(report_str + '\n')
        except Exception as err:
            print('#############################################')
            print(
                'except  »  with open(os.path.join(save_dir, train_report.log), a) as f:'
            )
            print('reason: {!r}'.format(err))
            print('#############################################')

        actor.set_actor_weights(weights)
        # Choose the noise type for the next episode.
        action_noise = np.random.rand() < 1 - param_noise_prob
        if not action_noise:
            set_params_noise(actor, states_np, random_process.current_sigma)

        # clear buffers
        del states[:]
        del actions[:]
        del rewards[:]
        del terminals[:]

        # Re-create the environment every 100 episodes.
        # NOTE(review): unlike the initial env, timestep_limit is not set to
        # 3000 here -- confirm whether that is intended.
        if total_episodes % 100 == 0:
            env = RunEnv2(model=args.modeldim,
                          prosthetic=args.prosthetic,
                          difficulty=args.difficulty,
                          skip_frame=config.skip_frames)
示例#27
0
class DDPG_trainer(object):
    """DDPG agent configured from module-level constants: actor/critic
    networks with target copies, a replay memory and OU exploration noise."""

    def __init__(self, nb_state, nb_action):
        """Build networks, optimizers, replay memory and the noise process.

        Args:
            nb_state: forwarded to ``Actor``/``Critic`` as their first argument.
            nb_action: forwarded to ``Actor``/``Critic``; also the size of the
                noise process.
        """
        self.nb_state = nb_state
        self.nb_action = nb_action

        self.actor = Actor(self.nb_state, self.nb_action)
        self.actor_target = Actor(self.nb_state, self.nb_action)
        self.actor_optim = Adam(self.actor.parameters(), lr=LEARNING_RATE)

        self.critic = Critic(self.nb_state, self.nb_action)
        self.critic_target = Critic(self.nb_state, self.nb_action)
        self.critic_optim = Adam(self.critic.parameters(), lr=LEARNING_RATE)

        # Make sure each target starts with the same weights as its network.
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_action,
                                                       theta=OU_THETA,
                                                       mu=OU_MU,
                                                       sigma=OU_SIGMA)

        self.is_training = True
        self.epsilon = 1.0
        self.a_t = None  # most recent action
        self.s_t = None  # most recent state

        if USE_CUDA:
            self.cuda()

    def cuda(self):
        """Move all four networks to the GPU."""
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def select_action(self, s_t, decay_epsilon=True):
        """Return the actor's action for ``s_t`` with epsilon-scaled noise.

        Noise is only added while ``is_training`` is true; the result is
        clipped to [-1, 1] and remembered as ``a_t``. When ``decay_epsilon``
        is true, epsilon is reduced by ``DELTA_EPSILON`` afterwards.
        """
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        # max(..., 0) keeps the noise scale from going negative once epsilon
        # has decayed below zero.
        action += self.is_training * max(self.epsilon,
                                         0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= DELTA_EPSILON

        self.a_t = action
        return action

    def reset(self, observation):
        """Start a new episode from ``observation`` and reset the noise.

        ``s_t`` is set as well as ``start_state``: ``observe`` stores
        ``self.s_t`` as the state of each transition, so without this the
        first transition after a reset would record ``None`` or the last
        state of the previous episode.
        """
        self.start_state = observation
        self.s_t = observation
        self.random_process.reset_states()

    def observe(self, r_t, s_t1, done):
        """Store ``(s_t, a_t, r_t, done)`` in memory and advance ``s_t`` to
        ``s_t1``; does nothing when ``is_training`` is false."""
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_all(self):
        """Run one critic step, one actor step and a soft target update.

        Returns early, without updating anything, while the memory holds
        fewer than ``2 * BATCH_SIZE`` entries or when the sampled batch
        contains a state whose first dimension is at most 2.
        """
        # Help Warm Up
        if self.memory.nb_entries < BATCH_SIZE * 2:
            return

        # Sample batch
        state_batch, action_batch, reward_batch, \
        next_state_batch, terminal_batch = self.memory.sample_and_split(BATCH_SIZE)

        # Reject malformed samples before doing any network work.
        if any(state.shape[0] <= 2 for state in state_batch):
            return

        # Prepare for the target q batch
        with torch.no_grad():
            next_state = to_tensor(next_state_batch)
            next_q_values = self.critic_target([
                next_state,
                self.actor_target(next_state),
            ])

        # ``np.float`` was an alias of the builtin ``float`` and has been
        # removed from NumPy; ``astype(float)`` yields the same dtype.
        # NOTE(review): terminal_batch scales the bootstrap term, so it is
        # presumably a "not terminal" mask -- confirm against the memory class.
        target_q_batch = to_tensor(reward_batch) + \
                         DISCOUNT * to_tensor(terminal_batch.astype(float)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic(
            [to_tensor(state_batch),
             to_tensor(action_batch)])
        value_loss = CRITERION(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's estimate of the actor's actions.
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [to_tensor(state_batch),
             self.actor(to_tensor(state_batch))])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, TAU)
        soft_update(self.critic_target, self.critic, TAU)
示例#28
0
    def __init__(self, gamma, tau, actor_hidden_size, critic_hidden_size,
                 observation_space, action_space, args):
        """Assemble the two actor stages, attention unit, communication
        channel and critic, together with their target copies, optimizers,
        replay memory and exploration noise.

        Networks are created in a fixed order (actor part 1, attention,
        communication, actor part 2, critic), each online network directly
        followed by its target.
        """
        n_inputs = observation_space.shape[0]
        comm_hidden = actor_hidden_size // 2
        # Input width of actor part 2 -- [thoughts, integrated thoughts].
        p2_input = actor_hidden_size + comm_hidden * 2

        # Plain configuration
        self.num_inputs = n_inputs
        self.action_space = action_space
        self.actor_hidden_size = actor_hidden_size
        self.critic_hidden_size = critic_hidden_size
        self.comm_hidden_size = comm_hidden
        self.gamma = gamma
        self.tau = tau
        self.args = args
        # Replay for the update of the attention unit
        self.queue = queue.Queue()

        # Actor part 1 and its target
        self.actor_p1 = ActorPart1(n_inputs, actor_hidden_size).to(device)
        self.actor_target_p1 = ActorPart1(n_inputs,
                                          actor_hidden_size).to(device)

        # The attention unit is not end-to-end trained; it has its own
        # optimizer and no target copy.
        self.atten = AttentionUnit(actor_hidden_size,
                                   actor_hidden_size).to(device)
        self.atten_optim = Adam(self.atten.parameters(), lr=args.actor_lr)

        # Communication channel and its target
        self.comm = CommunicationChannel(actor_hidden_size,
                                         comm_hidden).to(device)
        self.comm_target = CommunicationChannel(actor_hidden_size,
                                                comm_hidden).to(device)
        self.comm_optim = Adam(self.comm.parameters(), lr=args.actor_lr)

        # Actor part 2 and its target
        self.actor_p2 = ActorPart2(p2_input, action_space,
                                   actor_hidden_size).to(device)
        self.actor_target_p2 = ActorPart2(p2_input, action_space,
                                          actor_hidden_size).to(device)
        # A single optimizer drives both actor stages, one group per stage.
        self.actor_optim = Adam([
            {'params': stage.parameters(), 'lr': args.actor_lr}
            for stage in (self.actor_p1, self.actor_p2)
        ])

        # Critic and its target
        self.critic = Critic(n_inputs, action_space,
                             critic_hidden_size).to(device)
        self.critic_target = Critic(n_inputs, action_space,
                                    critic_hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # Make sure every target starts with the same weights as its network.
        for target_net, online_net in (
                (self.actor_target_p1, self.actor_p1),
                (self.comm_target, self.comm),
                (self.actor_target_p2, self.actor_p2),
                (self.critic_target, self.critic),
        ):
            hard_update(target_net, online_net)

        # Replay buffer and exploration noise
        self.memory = ReplayMemory(args.memory_size)
        self.random_process = OrnsteinUhlenbeckProcess(size=action_space.n,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)
        critic = Critic(state_dim, action_dim, max_action, args)
        critic_t = Critic(state_dim, action_dim, max_action, args)
        critic_t.load_state_dict(critic.state_dict())

    print("OK 3")
    # actor
    actor = Actor(state_dim, action_dim, max_action, args)
    actor_t = Actor(state_dim, action_dim, max_action, args)
    actor_t.load_state_dict(actor.state_dict())

    # action noise
    if not args.ou_noise:
        a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)
    else:
        a_noise = OrnsteinUhlenbeckProcess(action_dim,
                                           mu=args.ou_mu,
                                           theta=args.ou_theta,
                                           sigma=args.ou_sigma)

    if USE_CUDA:
        critic.cuda()
        critic_t.cuda()
        actor.cuda()
        actor_t.cuda()

    print("OK 4")
    # CEM
    es = sepCEM(actor.get_size(),
                mu_init=actor.get_params(),
                sigma_init=args.sigma_init,
                damp=args.damp,
                damp_limit=args.damp_limit,
示例#30
0
class DDPG(object):
    """DDPG agent with actor/critic networks, target copies, a replay memory
    and Ornstein-Uhlenbeck exploration noise."""

    def __init__(self,
                 state_size,
                 action_size,
                 memory_size,
                 batch_size=128,
                 tan=0.001,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 epsilon=1.):
        """Build networks, optimizers, memory and the noise process.

        Args:
            state_size: forwarded to ``Actor``/``Critic``.
            action_size: forwarded to ``Actor``/``Critic``; also the size of
                the noise process.
            memory_size: forwarded to ``Memory``.
            batch_size: number of transitions sampled per ``train`` call.
            tan: coefficient handed to ``update_parameter`` for the target
                networks (presumably the soft-update rate tau -- confirm).
            actor_lr: Adam learning rate of the actor.
            critic_lr: Adam learning rate of the critic.
            epsilon: initial scale of the exploration noise.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.tan = tan
        self.warmup = WARM_UP
        self.epsilon = epsilon
        self.epsilon_decay = hyperparameters['D_EPSILON']

        self.actor = Actor(state_size, action_size)
        self.actor_target = Actor(state_size, action_size)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = Critic(state_size, action_size)
        self.critic_target = Critic(state_size, action_size)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr)
        self.memory = Memory(memory_size)
        self.criterion = nn.MSELoss()

        self.random_process = OrnsteinUhlenbeckProcess(size=action_size,
                                                       theta=0.15,
                                                       mu=0.,
                                                       sigma=0.2)

        copy_parameter(self.actor, self.actor_target)
        copy_parameter(self.critic, self.critic_target)

    def train(self):
        """Run one critic step, one actor step and a target update.

        Returns immediately while ``memory.counter`` is below ``warmup``.
        """
        # still warming up: nothing to learn from yet
        if self.memory.counter < self.warmup:
            return

        # get batch
        state_batch, action_batch, next_state_batch, reward_batch, done_batch = self.memory.sample(
            self.batch_size)
        action_batch = action_batch.reshape((-1, self.action_size))
        reward_batch = reward_batch.reshape((-1, 1))
        done_batch = done_batch.reshape((-1, 1))

        # update critic
        # The target networks are evaluated under no_grad, which replaces the
        # removed ``volatile`` flag: the bootstrap value carries no graph.
        nsb = torch.from_numpy(next_state_batch).float()  # next_state_batch
        with torch.no_grad():
            nab = self.actor_target(nsb)  # next_action_batch
            next_q = self.critic_target(nsb, nab)

        rb = torch.from_numpy(reward_batch).float()  # reward_batch
        # if next state is None, next_q should be 0, which means q = r
        db = torch.from_numpy(done_batch).float()
        q_target = rb + hyperparameters['GAMMA'] * db * next_q

        sb = torch.from_numpy(state_batch).float()  # state_batch
        ab = torch.from_numpy(action_batch).float()  # action_batch
        q_eval = self.critic(sb, ab)

        value_loss = self.criterion(q_eval, q_target)
        self.critic.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # update actor: ascend the critic's estimate of the actor's actions
        aab = self.actor(sb)  # actor_action_batch

        q = self.critic(sb, aab)
        policy_loss = torch.mean(-q)
        self.actor.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update parameter between two network
        update_parameter(self.critic_target, self.critic, self.tan)
        update_parameter(self.actor_target, self.actor, self.tan)

    def select_action(self, s, is_train=True, decay_e=True):
        """Return a scalar action for state ``s``.

        During warm-up the action is sampled from the module-level ``env``.
        Afterwards the actor output plus epsilon-scaled noise (only when
        ``is_train``) is clipped to [-1, 1] and its first component is
        returned as a float. Epsilon decays only once ``memory.counter``
        exceeds ``warmup`` and ``decay_e`` is true.
        """
        if self.memory.counter < self.warmup:
            action = env.action_space.sample()[0]
            return action
        state = torch.FloatTensor([s])
        action = self.actor(state).squeeze(1).data.numpy()
        # max(..., 0) keeps the noise scale from going negative once epsilon
        # has decayed below zero.
        action += is_train * max(self.epsilon,
                                 0) * self.random_process.sample()
        action = float(np.clip(action, -1., 1.)[0])

        if decay_e:
            if self.memory.counter > self.warmup:
                self.epsilon -= self.epsilon_decay
        return action