Example #1
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        #Policy Model & Value Model
        self.actorLocal = Actor(self.state_size, self.action_size,
                                self.action_low, self.action_high)
        self.criticLocal = Critic(self.state_size, self.action_size)
        self.actorTarget = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.criticTarget = Critic(self.state_size, self.action_size)

        #Initializing target model with local model params
        self.criticTarget.model.set_weights(
            self.criticLocal.model.get_weights())
        self.actorTarget.model.set_weights(self.actorLocal.model.get_weights())

        #Replay Buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.noise = OUNoise(self.action_size, 0, 0.1, 0.25)
        self.discountGamma = 0.9
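All of these examples construct an OUNoise(size, mu, theta, sigma) process but never show it. A minimal Ornstein-Uhlenbeck noise sketch that matches the constructor and the reset()/sample() calls used throughout this section might look as follows (the internal attribute names are assumptions):

import copy
import numpy as np

# Minimal sketch of the OUNoise process assumed by these examples.
# The constructor signature and the reset()/sample() methods are taken from the
# calls in the snippets; everything else is an assumption.
class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Mean-reverting step plus Gaussian noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state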
Example #2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
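The ReplayBuffer(buffer_size, batch_size) referenced here is also not shown. A minimal sketch consistent with the add/sample/len usage in the later examples, which access experiences as e.state, e.action, e.reward, e.next_state, e.done, could be (the internal layout is an assumption; Example #11 uses nextState instead of next_state):

import random
from collections import deque, namedtuple

# Minimal sketch of the ReplayBuffer interface these agents rely on.
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # bounded experience store
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform random mini-batch of stored experiences.
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)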
Example #3
    def __init__(self, task):
        print('loaded DDPG ', task.observation_space.shape[0])
        self.task = task
        self.state_size = task.observation_space.shape[0]
        self.action_size = task.action_space.shape[0]
        self.action_low = task.action_space.low
        self.action_high = task.action_space.high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.995  # discount factor
        self.tau = 1.  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range
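Examples #3, #12, and #13 also initialize w, best_w, and noise_scale, which belong to a simple linear-policy random search rather than to DDPG itself. For orientation, weights of shape (state_size, action_size) like w are typically turned into an action roughly as in this sketch (the clipping step is an assumption):

import numpy as np

# Sketch: how linear-policy weights such as `w` above are usually applied.
def linear_act(state, w, action_low, action_high):
    action = np.dot(state, w)                        # linear map: state -> action
    return np.clip(action, action_low, action_high)  # keep action in bounds (assumed)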
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        actor_local_params = actor_params()
        actor_target_params = actor_params()
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 params=actor_local_params)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  params=actor_target_params)

        # Critic (Value) Model
        critic_local_params = critic_params()
        critic_target_params = critic_params()
        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   params=critic_local_params)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    params=critic_target_params)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        agent_par = agent_params(self.action_size)
        # Noise process
        self.noise = agent_par.noise
        # Replay memory
        self.batch_size = agent_par.batch_size
        self.memory = agent_par.memory
        # Algorithm parameters
        self.gamma = agent_par.gamma  # discount factor
        self.tau = agent_par.tau  # for soft update of target parameters

        # Compute the ongoing top score
        self.top_score = -np.inf
        self.score = 0
Example #5
    def __init__(self, task, log=None):
        self.task = task
        #Add log utility
        self.log = log
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.log)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.log)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.log)
        self.critic_target = Critic(self.state_size, self.action_size, self.log)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        # @hakimka on the forums said to use mu = 0.001, theta = 0.05, sigma = 0.0015, but...
        self.exploration_mu = 0.001
        self.exploration_theta = 0.05
        self.exploration_sigma = 0.0015
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.log)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor 0.99
        self.tau = 0.01  # for soft update of target parameters  0.01

        # score
        self.total_reward = 0.0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf
Example #6
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        #print("initializing DDPG agent")
        # Actor (Policy) Model
        #print("initializing actor_local")
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        #print("initializing actor_target")
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        #print("initializing critic_local")
        self.critic_local = Critic(self.state_size, self.action_size)
        #print("initializing critic_target")
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 1.0  # initial value was 0
        self.exploration_theta = 0.5  # initial value was 0.15
        self.exploration_sigma = 0.15  # initial value was 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000  # initially this was buffer_size = 100000
        self.batch_size = 64  # initial was 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor initial value was 0.99
        self.tau = 0.05  # for soft update of target parameters initial value was 0.01
Example #7
    def __init__(self, task):
        self.task = task
        self.stateSize = task.stateSize
        self.actionSize = task.actionSize
        self.actionLow = task.actionLow
        self.actionHigh = task.actionHigh
        self.localActor = Actor(self.stateSize, self.actionSize, self.actionLow, self.actionHigh)
        self.targetActor = Actor(self.stateSize, self.actionSize, self.actionLow, self.actionHigh)
        self.localCritic = Critic(self.stateSize, self.actionSize)
        self.targetCritic = Critic(self.stateSize, self.actionSize)
        self.targetCritic.model.set_weights(self.localCritic.model.get_weights())
        self.targetActor.model.set_weights(self.localActor.model.get_weights())
        self.explorationMu = 0.0
        self.explorationTheta = 0.15
        self.explorationSigma = 0.2
        self.noise = OUNoise(self.actionSize, self.explorationMu, self.explorationTheta, self.explorationSigma)
        self.buffer = 100000
        self.batchSize = 64
        self.memory = ReplayBuffer(self.buffer, self.batchSize)
        self.gamma = 0.99
        self.tau = 0.01
Example #8
    def __init__(self, task=None, type_name=None, name=None):
        '''
        Factory class for actor-critic models
        '''
        self.model = None
        self.name = name
        self.task = task
        self.type_name = type_name

        if type_name in ["actor"]:
            self.model = Actor(state_size=self.task.state_size,
                               action_size=self.task.action_size,
                               action_low=task.action_low,
                               action_high=task.action_high,
                               name=name)
        elif type_name in ["critic"]:
            self.model = Critic(state_size=self.task.state_size,
                                action_size=self.task.action_size,
                                name=name)
        else:
            print("Unknown model type '{}'; no model was built.".format(type_name))
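The fragment above does not show the name of the enclosing factory class. Assuming it were called ModelFactory (a hypothetical name), usage would look roughly like this:

# Hypothetical usage of the factory __init__ above; "ModelFactory" is an assumed
# name for the enclosing class, which this fragment does not show.
actor_local = ModelFactory(task=task, type_name="actor", name="actor_local").model
critic_local = ModelFactory(task=task, type_name="critic", name="critic_local").model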
Example #9
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        self.total_reward = 0.0
        self.score = 0.0
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0.0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        self.score = self.total_reward
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
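For context, an agent like the one in Example #9 is usually driven by a loop along these lines (a sketch; the task.step(action) API returning (next_state, reward, done) is an assumption, as is the episode count):

# Sketch of a training loop for the DDPG agent above.
# `task` is assumed to expose a step(action) -> (next_state, reward, done) method.
agent = DDPG(task)
num_episodes = 500  # assumed value
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d}  score: {:8.3f}".format(i_episode, agent.score))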
Example #10
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

        Params
        ======
            task (Task): Instance of the Task class which reports the environment to this agent
            log (Log): Reference to the log utility.
    """
    def __init__(self, task, log=None):
        self.task = task
        #Add log utility
        self.log = log
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.log)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.log)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.log)
        self.critic_target = Critic(self.state_size, self.action_size, self.log)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        # @hakimka on the forums said to use mu = 0.001, theta = 0.05, sigma = 0.0015, but...
        self.exploration_mu = 0.001
        self.exploration_theta = 0.05
        self.exploration_sigma = 0.0015
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, self.log)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor 0.99
        self.tau = 0.01  # for soft update of target parameters  0.01

        # score
        self.total_reward = 0.0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1
        #if self.log != None:
        #    self.log.write('DDPG.step len(self.memory)=' + str(len(self.memory)) + \
        #        ' total_reward=' + str(self.total_reward) + ' count=' + str(self.count))

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            #if self.log != None:
            #    self.log.write('DDPG.step len(self.memory) > self.batch_size' + \
            #        str(len(self.memory)) + '>' + str(self.batch_size))
            self.learn(experiences)
        #print(self.critic_local.model.get_weights()[0][0])
        #print(self.critic_target.model.get_weights()[0][0])
        #for lay in self.critic_local.model.layers:
        #    if lay.name == 'q_values':
        #        print(lay.name + ': ' + str(lay.get_weights()[0][5]))

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        #if self.log != None:
        #    self.log.write('DDPG.learn experiences=' + str(len(experiences)))
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        #actions_next_normal = ( actions_next - self.action_low ) / self.action_high
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        #Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next_normal])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        #actions_normal = ( actions - self.action_low ) / self.action_high
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        #self.critic_local.model.train_on_batch(x=[states, actions_normal], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        #action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions_normal, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function
        if self.log is not None:
            self.log.write('DDPG.learn Q_targets=' + str(Q_targets))
            self.log.write('DDPG.learn action_gradients=' + str(action_gradients))

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(self.count) if self.count else 0.0
        self.best_score = max(self.best_score, self.score)
        #if self.score > self.best_score:
        #    self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters. As opposed to a fixed Q-targets method."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
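Examples #5 and #10 take an optional log utility and only ever call log.write(text). A minimal sketch compatible with that usage (the file name and append behaviour are assumptions) is:

# Minimal sketch of a log utility matching the log.write(...) calls above.
class Log:
    def __init__(self, path="ddpg.log"):  # path is an assumption
        self.path = path

    def write(self, text):
        with open(self.path, "a") as f:  # append each message on its own line
            f.write(str(text) + "\n")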
Example #11
class DDPG():
    def __init__(self, task):
        self.task = task
        self.stateSize = task.stateSize
        self.actionSize = task.actionSize
        self.actionLow = task.actionLow
        self.actionHigh = task.actionHigh
        self.localActor = Actor(self.stateSize, self.actionSize, self.actionLow, self.actionHigh)
        self.targetActor = Actor(self.stateSize, self.actionSize, self.actionLow, self.actionHigh)
        self.localCritic = Critic(self.stateSize, self.actionSize)
        self.targetCritic = Critic(self.stateSize, self.actionSize)
        self.targetCritic.model.set_weights(self.localCritic.model.get_weights())
        self.targetActor.model.set_weights(self.localActor.model.get_weights())
        self.explorationMu = 0.0
        self.explorationTheta = 0.15
        self.explorationSigma = 0.2
        self.noise = OUNoise(self.actionSize, self.explorationMu, self.explorationTheta, self.explorationSigma)
        self.buffer = 100000
        self.batchSize = 64
        self.memory = ReplayBuffer(self.buffer, self.batchSize)
        self.gamma = 0.99
        self.tau = 0.01

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last = state
        return state

    def step(self, action, reward, nextState, done):
        self.memory.add(self.last, action, reward, nextState, done)
        if len(self.memory) > self.batchSize:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last = nextState

    def act(self, state):
        state = np.reshape(state, [-1, self.stateSize])
        action = self.localActor.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        states = np.vstack([exp.state for exp in experiences if exp is not None])

        actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.actionSize)

        rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1)

        dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1)

        nextStates = np.vstack([exp.nextState for exp in experiences if exp is not None])

        nextActions = self.targetActor.model.predict_on_batch(nextStates)

        QNext = self.targetCritic.model.predict_on_batch([nextStates, nextActions])

        QTargets = rewards + self.gamma * QNext * (1 - dones)

        self.localCritic.model.train_on_batch(x=[states, actions], y=QTargets)

        Gradients = np.reshape(self.localCritic.get_action_gradients([states, actions, 0]), (-1, self.actionSize))

        self.localActor.train_fn([states, Gradients, 1])  # custom training function, as in the other examples

        self.soft(self.localCritic.model, self.targetCritic.model)

        self.soft(self.localActor.model, self.targetActor.model)   

    def soft(self, localModel, targetModel):
        local = np.array(localModel.get_weights())
        target = np.array(targetModel.get_weights())
        assert len(local) == len(target)
        new = self.tau * local + (1 - self.tau) * target
        targetModel.set_weights(new)
Example #12
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.4
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.995  # discount factor
        self.tau = 0.9  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range

        # Episode variables
        #         self.reset_episode()

        # Load saved weights if they exist
        try:
            self.actor_local.model.load_weights('actormodel.h5')
            self.actor_target.model.load_weights('actormodel.h5')
            self.critic_local.model.load_weights('criticmodel.h5')
            self.critic_target.model.load_weights('criticmodel.h5')
            print('Weights loaded successfully')
        except Exception:
            print("Could not find saved weights")

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise_scale = 0.1
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            self.score_update()

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
#         print('actor weight: \n',self.critic_local.model.get_weights() )

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def score_update(self):
        # Learn by random policy search, using a reward-based score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

        if self.score > self.best_score:
            self.best_score = self.score
            self.best_w = self.w
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
            self.actor_local.model.save_weights("actormodel.h5",
                                                overwrite=True)
            #             with open("actormodel.json", "w") as outfile:
            #                 json.dump(self.actor_local.model.to_json(), outfile)

            self.critic_local.model.save_weights("criticmodel.h5",
                                                 overwrite=True)
#             with open("criticmodel.json", "w") as outfile:
#                 json.dump(self.critic_local.model.to_json(), outfile)
        else:
            self.w = self.best_w
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        self.w = self.w + self.noise_scale * np.random.normal(
            size=self.w.shape)  # equal noise in all directions
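Example #12 restores saved weights inside a broad try/except. An equivalent sketch that checks for the weight files explicitly (same file names as in the example; agent is a DDPG instance) would be:

import os

# Sketch: explicit existence check instead of catching every exception.
if os.path.isfile('actormodel.h5') and os.path.isfile('criticmodel.h5'):
    agent.actor_local.model.load_weights('actormodel.h5')
    agent.actor_target.model.load_weights('actormodel.h5')
    agent.critic_local.model.load_weights('criticmodel.h5')
    agent.critic_target.model.load_weights('criticmodel.h5')
    print('Weights loaded successfully')
else:
    print('No saved weights found; starting from scratch')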
Example #13
    def __init__(self, task):

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.4
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.995  # discount factor
        self.tau = 0.9  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range

        # Episode variables
        #         self.reset_episode()

        # Load saved weights if they exist
        try:
            self.actor_local.model.load_weights('actormodel.h5')
            self.actor_target.model.load_weights('actormodel.h5')
            self.critic_local.model.load_weights('criticmodel.h5')
            self.critic_target.model.load_weights('criticmodel.h5')
            print('Weights loaded successfully')
        except Exception:
            print("Could not find saved weights")
Example #14
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        #Policy Model & Value Model
        self.actorLocal = Actor(self.state_size, self.action_size,
                                self.action_low, self.action_high)
        self.criticLocal = Critic(self.state_size, self.action_size)
        self.actorTarget = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.criticTarget = Critic(self.state_size, self.action_size)

        #Initializing target model with local model params
        self.criticTarget.model.set_weights(
            self.criticLocal.model.get_weights())
        self.actorTarget.model.set_weights(self.actorLocal.model.get_weights())

        #Replay Buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.noise = OUNoise(self.action_size, 0, 0.1, 0.25)
        self.discountGamma = 0.9

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            exp = self.memory.sample()
            self.learn(exp)
        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actorLocal.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, exp):
        """
            https://docs.scipy.org/doc/numpy/reference/generated/numpy.vstack.html
            Vertical Stacking of arrays
            This took a long time to get in place :). Thanks to some other references in github too for examples. 
        """
        state = np.vstack([ex.state for ex in exp if ex is not None])
        action = np.array([ex.action for ex in exp
                           if ex is not None]).reshape(-1, self.action_size)
        reward = np.array([ex.reward for ex in exp
                           if ex is not None]).reshape(-1, 1)
        done = np.array([ex.done for ex in exp
                         if ex is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [ex.next_state for ex in exp if ex is not None])

        actions_next = self.actorTarget.model.predict_on_batch(next_states)
        QTargets_next = self.criticTarget.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = reward + self.discountGamma * QTargets_next * (1 - done)
        self.criticLocal.model.train_on_batch(x=[state, action], y=Q_targets)

        actionGradients = np.reshape(
            self.criticLocal.get_action_gradients([state, action, 0]),
            (-1, self.action_size))
        self.actorLocal.train_fn([state, actionGradients, 1])

        # Soft-update target models
        self.criticTarget.model.set_weights(
            0.01 * np.array(self.criticLocal.model.get_weights()) +
            (1 - 0.01) * np.array(self.criticTarget.model.get_weights()))
        self.actorTarget.model.set_weights(
            0.01 * np.array(self.actorLocal.model.get_weights()) +
            (1 - 0.01) * np.array(self.actorTarget.model.get_weights()))
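Example #14 inlines its soft update with a hard-coded factor of 0.01. The same update with the mixing factor named tau, as the other examples do, would be (a sketch):

import numpy as np

# Sketch: the inline soft update above, factored out with an explicit tau.
# tau = 0.01 reproduces the hard-coded value in Example #14.
def soft_update(local_model, target_model, tau=0.01):
    local_weights = np.array(local_model.get_weights())
    target_weights = np.array(target_model.get_weights())
    target_model.set_weights(tau * local_weights + (1 - tau) * target_weights)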