Example #1
 def __init__(self, task, config):
     Algorithm.__init__(self, task, config)
     # Scale parameters.
     assert self.config.start_learning <= self.config.replay_capacity
     assert self.config.start_learning >= self.config.batch_size
     self.config.start_learning *= self.config.frame_skip
     self.config.sync_target *= self.config.frame_skip
     self.config.epsilon.over *= self.config.frame_skip
     # Preprocessing.
     self._preprocess = self._create_preprocess()
     Experience.__init__(self, self._preprocess.above_task)
     # Network.
     self._model = Model(self._create_network)
     self._target = Model(self._create_network)
     self._target.weights = self._model.weights
     self._sync_target = Every(self.config.sync_target,
                               self.config.start_learning)
     print(str(self._model))
     # Learning.
     observ_shape = self._preprocess.above_task.observs.shape
     shapes = (observ_shape, tuple(), tuple(), observ_shape)
     self._memory = Memory(self.config.replay_capacity, shapes)
     self._log_memory_size()
     self._learning_rate = Decay(self.config.initial_learning_rate, 0,
                                 self.task.steps)
     self._cost_metric = Metric(self.task, 'dqn/cost', 1)
     self._learning_rate_metric = Metric(self.task, 'dqn/learning_rate', 1)
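The scaling block at the top multiplies step-based thresholds by frame_skip, presumably so they are counted in raw environment frames rather than agent decisions. A small worked example using the defaults from the DQN class in Example #6 (frame_skip=4, start_learning=5e4, sync_target=2500, epsilon.over=1e6):

frame_skip = 4
start_learning = int(5e4) * frame_skip   # 200,000 frames before learning starts
sync_target = 2500 * frame_skip          # sync the target network every 10,000 frames
epsilon_over = int(1e6) * frame_skip     # anneal epsilon over 4,000,000 frames
print(start_learning, sync_target, epsilon_over)
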
Example #2
 def __init__(self, task, config):
     Algorithm.__init__(self, task, config)
     # Parse parameters (until YAML 1.2 support).
     self.config.start_learning = int(float(self.config.start_learning))
     self.config.sync_target = int(float(self.config.sync_target))
     self.config.epsilon.over = int(float(self.config.epsilon.over))
     # Scale parameters.
     assert self.config.start_learning <= self.config.replay_capacity
     assert self.config.start_learning >= self.config.batch_size
     self.config.start_learning *= self.config.frame_skip
     self.config.sync_target *= self.config.frame_skip
     self.config.epsilon.over *= self.config.frame_skip
     # Preprocessing.
     self._preprocess = self._create_preprocess()
     Experience.__init__(self, self._preprocess.above_task)
     # Network.
     self._model = Model(self._create_network)
     self._target = Model(self._create_network)
     self._target.weights = self._model.weights
     self._sync_target = Every(self.config.sync_target,
                               self.config.start_learning)
     # print(str(self._model))
     # Learning.
     self._memory = Memory(int(float(self.config.replay_capacity)))
     self._learning_rate = Decay(float(self.config.initial_learning_rate),
                                 0, self.task.steps)
     self._cost_metric = Metric(self.task, 'dqn/cost', 1)
     self._learning_rate_metric = Metric(self.task, 'dqn/learning_rate', 1)
Example #3
 def __init__(self, task, config):
     super().__init__(task, config)
     self._preprocess = self._create_preprocess()
     self.model = Model(self._create_network)
     # print(str(self.model))
     self.learning_rate = Decay(
         float(config.initial_learning_rate), 0, self.task.steps)
     self.costs = None
     self.values = None
     self.choices = None
Example #4
 def train_policies(self):
     trainers = []
     for _ in range(self.config.learners):
         config = AttrDict(self.config.copy())
         model = Model(self._create_network, threads=1)
         model.weights = self.model.weights
         policy = Sequential(self.task)
         policy.add(self._create_preprocess())
         policy.add(Train, config, self, model)
         trainers.append(policy)
     return trainers
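This method only builds the per-learner policies; running them concurrently is what makes the training asynchronous (Example #5 below creates config.learners of them). A minimal, self-contained sketch of that idea with plain Python threads, where step_policy is a hypothetical stand-in for the real interaction loop and is not part of the library shown above:

import threading

# Hypothetical stand-in for one worker's interaction loop; the real loop
# would step the environment and call the training policy's methods.
def step_policy(policy_id, steps, results):
    done = 0
    for _ in range(steps):
        done += 1  # placeholder for observe / perform / train
    results[policy_id] = done

results = {}
threads = [threading.Thread(target=step_policy, args=(i, 100, results))
           for i in range(16)]  # config.learners defaults to 16 (Example #5)
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
print(sorted(results.items()))
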
Example #5
class A3C(Algorithm):
    """
    Algorithm: Asynchronous Advantage Actor Critic (A3C)
    Paper: Asynchronous Methods for Deep Reinforcement Learning
    Authors: Mnih et al. 2016
    PDF: https://arxiv.org/pdf/1602.01783v2.pdf
    """
    @classmethod
    def defaults(cls):
        # Preprocessing.
        subsample = 2
        frame_skip = 4
        history = 4
        delta = False
        frame_max = 2
        noop_max = 30
        # Architecture.
        learners = 16
        apply_gradient = 5
        network = 'network_a3c_lstm'
        scale_critic_loss = 0.5
        regularize = 0.01
        # Optimizer.
        initial_learning_rate = 7e-4
        optimizer = tf.train.RMSPropOptimizer
        rms_decay = 0.99
        rms_epsilon = 0.1
        return merge_dicts(super().defaults(), locals())

    def __init__(self, task, config):
        super().__init__(task, config)
        self._preprocess = self._create_preprocess()
        self.model = Model(self._create_network)
        # print(str(self.model))
        self.learning_rate = Decay(float(config.initial_learning_rate), 0,
                                   self.task.steps)
        self.lock = Lock()
        self.costs = None
        self.values = None
        self.choices = None

    @property
    def train_policies(self):
        trainers = []
        for _ in range(self.config.learners):
            config = AttrDict(self.config.copy())
            # TODO: Use single model to share RMSProp statistics.
            model = Model(self._create_network, threads=1)
            model.weights = self.model.weights
            policy = Sequential(self.task)
            policy.add(self._create_preprocess())
            policy.add(Train, config, self, model)
            trainers.append(policy)
        return trainers

    @property
    def test_policy(self):
        policy = Sequential(self.task)
        policy.add(self._preprocess)
        policy.add(Test, self.model)
        return policy

    def begin_epoch(self):
        super().begin_epoch()
        self.costs = []
        self.values = []
        self.choices = []

    def end_epoch(self):
        super().end_epoch()
        if self.costs:
            average = sum(self.costs) / len(self.costs)
            print('Cost  {:12.5f}'.format(average))
        if self.values:
            average = sum(self.values) / len(self.values)
            print('Value {:12.5f}'.format(average))
        if self.choices:
            dist = np.bincount(self.choices) / len(self.choices)
            dist = ' '.join('{:.2f}'.format(x) for x in dist)
            print('Choices [{}]'.format(dist))
        if self.task.directory:
            self.model.save(self.task.directory, 'model')

    def _create_network(self, model):
        observs = self._preprocess.above_task.observs
        actions = self._preprocess.above_task.actions
        # Perception.
        state = model.add_input('state', observs.shape)
        hidden = getattr(networks, self.config.network)(model, state)
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        policy = dense(value, actions.n, tf.nn.softmax)
        model.add_output('choice',
                         tf.squeeze(tf.multinomial(tf.log(policy), 1), [1]))
        # Objectives.
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions.n)
        return_ = model.add_input('return_')
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
        advantage = tf.stop_gradient(return_ - value)
        actor = advantage * logprob + self.config.regularize * entropy
        critic = self.config.scale_critic_loss * (return_ - value)**2 / 2
        # Training.
        learning_rate = model.add_option(
            'learning_rate', float(self.config.initial_learning_rate))
        model.set_optimizer(
            self.config.optimizer(learning_rate,
                                  self.config.rms_decay,
                                  use_locking=True))
        model.add_cost('cost', critic - actor)

    def _create_preprocess(self):
        policy = Sequential(self.task)
        if self.config.noop_max:
            policy.add(RandomStart, self.config.noop_max)
        if self.config.frame_skip:
            policy.add(Skip, self.config.frame_skip)
        if self.config.frame_max:
            policy.add(Maximum, self.config.frame_max)
        if self.config.history:
            policy.add(Grayscale)
        if self.config.subsample > 1:
            sub = self.config.subsample
            amount = (sub, sub) if self.config.history else (sub, sub, 1)
            policy.add(Subsample, amount)
        if self.config.delta:
            policy.add(Delta)
        if self.config.history:
            policy.add(History, self.config.history)
        policy.add(ClampReward)
        policy.add(Normalize)
        return policy
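For reference, the loss that _create_network assembles can be written out with plain NumPy. The sketch below mirrors the logprob, entropy, advantage, actor, and critic expressions above for a toy batch of two transitions; the concrete numbers are made up, the 1e-13 terms are the same numerical stabilizers used in the graph, and regularize=0.01 and scale_critic_loss=0.5 are the class defaults:

import numpy as np

policy = np.array([[0.7, 0.2, 0.1], [0.3, 0.3, 0.4]])  # softmax over 3 actions
action = np.eye(3)[[0, 2]]                             # one-hot chosen actions
value = np.array([0.5, 1.0])                           # critic estimates
return_ = np.array([1.0, 0.0])                         # empirical returns
regularize, scale_critic_loss = 0.01, 0.5              # defaults from above

logprob = np.log((policy * action).sum(1) + 1e-13)
entropy = -(np.log(policy + 1e-13) * policy).sum()
advantage = return_ - value             # stop_gradient: treated as a constant
actor = advantage * logprob + regularize * entropy
critic = scale_critic_loss * (return_ - value) ** 2 / 2
cost = critic - actor                   # what the graph registers as 'cost'
print(cost)
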
Example #6
class DQN(Algorithm, Experience):
    """
    Algorithm: Deep Q-Network (DQN)
    Paper: Human-level control through deep reinforcement learning
    Authors: Mnih et al. 2015
    PDF: https://goo.gl/Y3e373
    """
    @classmethod
    def defaults(cls):
        # Preprocessing.
        subsample = 2
        frame_skip = 4
        history = 4
        delta = False
        frame_max = 2
        noop_max = 30
        # Architecture.
        network = 'network_dqn_2015'
        replay_capacity = 1e5  # 1e6
        start_learning = 5e4
        # Exploration.
        epsilon = dict(from_=1.0,
                       to=0.1,
                       test=0.05,
                       over=1e6,
                       offset=start_learning)
        # Learning.
        batch_size = 32
        sync_target = 2500
        # Optimizer.
        initial_learning_rate = 2.5e-4
        optimizer = tf.train.RMSPropOptimizer
        rms_decay = 0.95
        rms_epsilon = 0.1
        return merge_dicts(super().defaults(), locals())

    def __init__(self, task, config):
        Algorithm.__init__(self, task, config)
        # Scale parameters.
        assert self.config.start_learning <= self.config.replay_capacity
        assert self.config.start_learning >= self.config.batch_size
        self.config.start_learning *= self.config.frame_skip
        self.config.sync_target *= self.config.frame_skip
        self.config.epsilon.over *= self.config.frame_skip
        # Preprocessing.
        self._preprocess = self._create_preprocess()
        Experience.__init__(self, self._preprocess.above_task)
        # Network.
        self._model = Model(self._create_network)
        self._target = Model(self._create_network)
        self._target.weights = self._model.weights
        self._sync_target = Every(self.config.sync_target,
                                  self.config.start_learning)
        print(str(self._model))
        # Learning.
        observ_shape = self._preprocess.above_task.observs.shape
        shapes = (observ_shape, tuple(), tuple(), observ_shape)
        self._memory = Memory(self.config.replay_capacity, shapes)
        self._log_memory_size()
        self._learning_rate = Decay(self.config.initial_learning_rate, 0,
                                    self.task.steps)
        self._cost_metric = Metric(self.task, 'dqn/cost', 1)
        self._learning_rate_metric = Metric(self.task, 'dqn/learning_rate', 1)

    def end_epoch(self):
        super().end_epoch()
        if self.task.directory:
            self._model.save(self.task.directory, 'model')

    def perform(self, observ):
        return self._model.compute('values', state=observ)

    def experience(self, observ, action, reward, successor):
        action = action.argmax()
        self._memory.append((observ, action, reward, successor))
        if self.task.step < self.config.start_learning:
            return
        observ, action, reward, successor = \
            self._memory.sample(self.config.batch_size)
        target = self._compute_target(reward, successor)
        if self._sync_target(self.task.step):
            self._target.weights = self._model.weights
        self._model.set_option('learning_rate',
                               self._learning_rate(self.task.step))
        cost = self._model.train('cost',
                                 state=observ,
                                 action=action,
                                 target=target)
        self._learning_rate_metric(self._model.get_option('learning_rate'))
        self._cost_metric(cost)

    @property
    def policy(self):
        # TODO: Why doesn't self.task work here?
        policy = Sequential(self._preprocess.task)
        policy.add(self._preprocess)
        policy.add(self)
        return policy

    def _create_preprocess(self):
        policy = Sequential(self.task)
        policy.add(Image)
        if self.config.noop_max:
            policy.add(RandomStart, self.config.noop_max)
        if self.config.frame_skip > 1:
            policy.add(Skip, self.config.frame_skip)
        if self.config.frame_max:
            policy.add(Maximum, self.config.frame_max)
        if self.config.history > 1:
            channels = policy.above_task.observs.shape[-1]
            policy.add(Grayscale, (0.299, 0.587, 0.114)[:channels])
        if self.config.subsample > 1:
            sub = self.config.subsample
            amount = (sub, sub) if self.config.history > 1 else (sub, sub, 1)
            policy.add(Subsample, amount)
        if self.config.delta:
            policy.add(Delta)
        if self.config.history > 1:
            policy.add(History, self.config.history)
        policy.add(Normalize)
        policy.add(ClampReward)
        policy.add(EpsilonGreedy, **self.config.epsilon)
        return policy

    def _create_network(self, model):
        observs = self._preprocess.above_task.observs.shape
        actions = self._preprocess.above_task.actions.shape[0]
        # Perception.
        state = model.add_input('state', observs)
        hidden = getattr(networks, self.config.network)(model, state)
        values = dense(hidden, actions, tf.identity)
        values = model.add_output('values', values)
        # Training.
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        target = model.add_input('target')
        model.add_output('value', tf.reduce_max(values, 1))
        # Optimization.
        learning_rate = model.add_option('learning_rate',
                                         self.config.initial_learning_rate)
        model.set_optimizer(
            self.config.optimizer(learning_rate=learning_rate,
                                  decay=self.config.rms_decay,
                                  epsilon=self.config.rms_epsilon))
        model.add_cost('cost', (tf.reduce_sum(action * values, 1) - target)**2)

    def _compute_target(self, reward, successor):
        terminal = np.isnan(successor.reshape((len(successor), -1))).any(1)
        successor = np.nan_to_num(successor)
        assert np.isfinite(successor).all()
        future = self._target.compute('value', state=successor)
        future[terminal] = 0
        target = reward + self.config.discount * future
        assert np.isfinite(target).all()
        return target

    def _log_memory_size(self):
        size = self._memory.nbytes / (1024**3)
        print('Replay memory size', round(size, 2), 'GB')
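The target computation in _compute_target can also be exercised in isolation. The sketch below reproduces its NaN-based terminal masking with NumPy, substituting random numbers for the target network's 'value' output; the discount value is assumed to come from the shared Algorithm defaults:

import numpy as np

discount = 0.99
reward = np.array([1.0, 0.0, -1.0])
successor = np.stack([                   # NaN marks a terminal transition
    np.ones((4, 4)), np.full((4, 4), np.nan), np.ones((4, 4))])

terminal = np.isnan(successor.reshape((len(successor), -1))).any(1)
successor = np.nan_to_num(successor)
future = np.random.uniform(0, 1, len(successor))  # stand-in for the target net
future[terminal] = 0                              # no bootstrapping at episode end
target = reward + discount * future
print(target)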