Exemplo n.º 1
0
 def train_net(self):
     """Train on the current experience replay for one iteration.

     Resets the three recent-loss windows, then calls ``self.train_model``
     ``hyperparams['updates_per_iter']`` times on minibatches drawn from
     ``self.experience_replay``.  Whenever the minibatch iterator is
     exhausted a new one is created and the epoch counter advances.
     Finally the replay is cleared and ``self.update_learning_rate()``
     is called.
     """
     batch_size = self.hyperparams['batch_size']
     replay = self.experience_replay

     def fresh_batches():
         # Start a new pass over the replay.
         return create_minibatch_iterator({'sample': replay}, None,
                                          self.batch_preprocessor,
                                          batch_size)

     batch_iter = fresh_batches()
     total_updates = self.hyperparams['updates_per_iter']

     # All three loss histories use the same window length.
     window = int(self.hyperparams['num_recent_steps'] /
                  self.hyperparams['num_threads'])
     self.recent_loss = deque(maxlen=window)
     self.recent_policy_loss = deque(maxlen=window)
     self.recent_value_loss = deque(maxlen=window)

     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same data.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model(batch[0], batch[1], batch[2], batch[3], show)
     self.experience_replay.clear()
     self.update_learning_rate()
Exemplo n.º 2
0
 def train_net(self):
     """Train on the current experience replay for one iteration.

     Calls ``self.train_model`` ``hyperparams['updates_per_iter']`` times
     on minibatches drawn from ``self.experience_replay`` (used as-is,
     without resampling).  Whenever the minibatch iterator is exhausted a
     new one is created and the epoch counter advances.  Afterwards the
     replay is cleared and ``self.update_learning_rate()`` is called.
     """
     batch_size = self.hyperparams['batch_size']
     replay = self.experience_replay

     def fresh_batches():
         # Start a new pass over the replay.
         return create_minibatch_iterator(
             {'sample': replay}, None, self.batch_preprocessor,
             batch_size)

     batch_iter = fresh_batches()
     total_updates = self.hyperparams['updates_per_iter']
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same data.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model(batch[0], batch[1], batch[2], show)
     self.experience_replay.clear()
     self.update_learning_rate()
Exemplo n.º 3
0
 def train_net(self, init=False):
     """Train on everything currently held in the experience replay.

     Args:
         init: when true, the number of updates is read from
             ``hyperparams['init_updates']`` instead of
             ``hyperparams['updates_per_iter']``.

     A bounded deque is created per call and handed to
     ``self.train_model`` together with each minibatch.  Whenever the
     minibatch iterator is exhausted a new one is created and the epoch
     counter advances.  Afterwards the replay is cleared and
     ``self.update_learning_rate()`` is called.
     """
     batch_size = self.hyperparams['batch_size']
     replay_items = self.experience_replay.get_all()

     def fresh_batches():
         # Start a new pass over the fetched replay contents.
         return create_minibatch_iterator(
             {'sample': replay_items}, None, self.batch_preprocessor,
             batch_size)

     batch_iter = fresh_batches()
     updates_key = 'init_updates' if init else 'updates_per_iter'
     total_updates = self.hyperparams[updates_key]
     loss_window = deque(maxlen=int(
         self.hyperparams['num_recent_steps'] /
         self.hyperparams['display_freq']))
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same data.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model(batch[0], batch[1], batch[2],
                          loss_window, show)
     self.experience_replay.clear()
     self.update_learning_rate()
Exemplo n.º 4
0
 def train_net(self):
     """Run ``updates_per_iter`` training updates over the replay.

     Minibatches come from ``self.experience_replay`` (the replay object
     itself is passed on, not a resampled copy) through
     ``self.batch_preprocessor``.  A new iterator is built each time the
     current one is exhausted, which also bumps the printed epoch number.
     The replay is cleared and the learning rate updated at the end.
     """
     batch_size = self.hyperparams['batch_size']
     replay = self.experience_replay

     def fresh_batches():
         # Start a new pass over the replay.
         return create_minibatch_iterator(
             {'sample': replay}, None, self.batch_preprocessor,
             batch_size)

     batch_iter = fresh_batches()
     total_updates = self.hyperparams['updates_per_iter']
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same data.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model(batch[0], batch[1], batch[2], show)
     self.experience_replay.clear()
     self.update_learning_rate()
Exemplo n.º 5
0
 def train_net(self):
     """Train on arrays pre-computed from the whole experience replay.

     ``self.sample_preprocessor`` is applied once to
     ``self.experience_replay``; the resulting states, targets and masks
     are then served in minibatches to ``self.train_model`` for
     ``hyperparams['updates_per_iter']`` updates.  A new iterator is
     built whenever the current one is exhausted.  The local references
     to the arrays are dropped explicitly at the end.
     """
     batch_size = self.hyperparams['batch_size']
     train_xs, train_y, train_mask = self.sample_preprocessor(
         self.experience_replay)

     def batches_for(xs, ys, mask):
         # Takes the arrays as arguments (rather than closing over them)
         # so the names can still be deleted below.
         return create_minibatch_iterator(
             {'state': xs['state'],
              'train_y': ys,
              'train_mask': mask},
             None, None, batch_size)

     batch_iter = batches_for(train_xs, train_y, train_mask)
     total_updates = self.hyperparams['updates_per_iter']
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same arrays.
             batch_iter = batches_for(train_xs, train_y, train_mask)
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model({'state': batch['state']}, batch['train_y'],
                          batch['train_mask'], show)
     # Drop the iterator and the training arrays explicitly.
     del batch_iter
     del train_xs['state']
     del train_xs, train_y, train_mask
Exemplo n.º 6
0
 def train_net(self):
     """Train on the current experience replay for one iteration.

     Prints the epoch banner, resets the three recent-loss windows, then
     calls ``self.train_model`` ``hyperparams['updates_per_iter']`` times
     on minibatches drawn from ``self.experience_replay``.  Whenever the
     minibatch iterator is exhausted a new one is created and the epoch
     counter advances.  Finally the replay is cleared and
     ``self.update_learning_rate()`` is called.
     """
     batch_size = self.hyperparams['batch_size']
     replay = self.experience_replay

     def fresh_batches():
         # Start a new pass over the replay.
         return create_minibatch_iterator(
             {'sample': replay}, None, self.batch_preprocessor,
             batch_size)

     batch_iter = fresh_batches()
     total_updates = self.hyperparams['updates_per_iter']
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)

     # All three loss histories use the same window length.
     window = int(self.hyperparams['num_recent_steps'] /
                  self.hyperparams['num_threads'])
     self.recent_loss = deque(maxlen=window)
     self.recent_value_loss = deque(maxlen=window)
     self.recent_policy_loss = deque(maxlen=window)

     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same data.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print(rule + ' Update num: %d ' % step + rule)
         self.train_model(batch[0], batch[1], batch[2], batch[3], show)
     self.experience_replay.clear()
     self.update_learning_rate()
Exemplo n.º 7
0
 def train_net(self):
     """Run ``updates_per_iter`` updates on pre-processed replay arrays.

     The whole ``self.experience_replay`` is converted once by
     ``self.sample_preprocessor`` into states, targets and masks.  These
     are iterated in minibatches and fed to ``self.train_model``; the
     iterator is rebuilt (and the printed epoch number bumped) each time
     it is exhausted.  The local references to the arrays are deleted
     before returning.
     """
     batch_size = self.hyperparams['batch_size']
     train_xs, train_y, train_mask = self.sample_preprocessor(
         self.experience_replay)

     def batches_for(xs, ys, mask):
         # Takes the arrays as arguments (rather than closing over them)
         # so the names can still be deleted below.
         return create_minibatch_iterator(
             {
                 'state': xs['state'],
                 'train_y': ys,
                 'train_mask': mask
             }, None, None, batch_size)

     batch_iter = batches_for(train_xs, train_y, train_mask)
     total_updates = self.hyperparams['updates_per_iter']
     rule = '-' * 40
     epoch_idx = 0
     print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(total_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same arrays.
             batch_iter = batches_for(train_xs, train_y, train_mask)
             batch = next(batch_iter)
             epoch_idx += 1
             print(rule + 'Epoch %d' % epoch_idx + rule)
         show = (step % self.hyperparams['display_freq'] == 0)
         if show:
             print('Update num: %d' % step)
         self.train_model({'state': batch['state']}, batch['train_y'],
                          batch['train_mask'], show)
     # Drop the iterator and the training arrays explicitly.
     del batch_iter
     del train_xs['state']
     del train_xs, train_y, train_mask
Exemplo n.º 8
0
 def train_partial_model(self, num_updates,
                         mega=False, display=False):
     """Run ``num_updates`` steps of ``self.partial_train_step``.

     Args:
         num_updates: number of training steps to run.
         mega: when true, ``len(replay) - rollout_len - 1`` entries are
             sampled from the replay instead of ``batch_size``, epoch
             and update banners are printed, and
             ``self.update_value_target_weights()`` is called every
             ``hyperparams['update_target_freq']`` updates.
         display: when true, every ``hyperparams['display_freq']``
             updates the state, reward and value losses are evaluated
             on the current batch, stored on ``self`` and reported via
             ``self.display_train_update``.
     """
     batch_size = self.hyperparams['batch_size']
     rollout_len = self.hyperparams['value_rollout_length']
     if mega:
         sample_count = len(self.experience_replay) - rollout_len - 1
     else:
         sample_count = batch_size
     replay_sample = self.experience_replay.sample(
         sample_count, rollout_len, decompress=False)

     def fresh_batches():
         # Start a new pass over the drawn sample.
         return create_minibatch_iterator(
             {'sample': replay_sample}, None, self.batch_preprocessor,
             batch_size)

     batch_iter = fresh_batches()
     rule = '-' * 40
     epoch_idx = 0
     if mega:
         print(rule + 'Epoch %d' % epoch_idx + rule)
     for step in range(num_updates):
         try:
             batch = next(batch_iter)
         except StopIteration:
             # Iterator ran dry: begin another pass over the same sample.
             batch_iter = fresh_batches()
             batch = next(batch_iter)
             epoch_idx += 1
             if mega:
                 print(rule + 'Epoch %d' % epoch_idx + rule)

         models = {'state': self.state_model,
                   'reward': self.reward_model,
                   'value': self.value_model}
         targets = {'state': self.state_y,
                    'reward': self.reward_y,
                    'value': self.value_y}
         feed_dict = self.make_feed_dict(models, targets, batch)
         self.sess.run(self.partial_train_step, feed_dict=feed_dict)

         if mega and step % self.hyperparams['update_target_freq'] == 0:
             self.update_value_target_weights()

         if not display or step % self.hyperparams['display_freq'] != 0:
             continue
         if mega:
             print('Partial update %d' % step)
         # Evaluate each loss on the batch that was just trained on.
         self.state_loss_val = self.state_loss.loss.eval(
             feed_dict=feed_dict, session=self.sess)
         self.reward_loss_val = self.reward_loss.loss.eval(
             feed_dict=feed_dict, session=self.sess)
         self.value_loss_val = self.value_loss.loss.eval(
             feed_dict=feed_dict, session=self.sess)
         self.display_train_update(step)
Exemplo n.º 9
0
 def recreate_batches():
     """Return a new training minibatch iterator.

     ``train_xs``, ``train_y``, ``train_mask`` and ``self`` are not
     parameters; they are resolved from the surrounding scope.
     """
     batches = create_minibatch_iterator(
         train_xs, train_y, self.model.train_batch_preprocessor,
         batch_size=self.hyperparams['batch_size'],
         train_mask=train_mask)
     return batches
Exemplo n.º 10
0
    def sample_preprocessor(self, sample):
        """Build training inputs, targets and action masks from experiences.

        Each element of ``sample`` is unpacked as a 5-tuple
        ``(state, action, reward, next_state, value)``.  Predictions for
        every ``next_state`` are computed in minibatches with
        ``self.q_model.compute_preds``; the target for an experience is
        ``reward + value + reward_discount * max(prediction row)``.

        Args:
            sample: a sized, re-iterable collection of experience tuples.

        Returns:
            ``(train_xs, train_y, train_mask)`` where ``train_xs`` is
            ``{'state': array of states}``, ``train_y`` holds one target
            per experience and ``train_mask`` has a 1 at column
            ``np.argmax(action)`` of each experience's row.

        Side effects:
            ``self.target`` and ``self.exp_returns`` are set to the values
            computed for the last experience.  They are left untouched
            when ``sample`` is empty (previously that case raised
            ``UnboundLocalError``).
        """
        reward_discount = self.hyperparams['reward_discount']
        batch_size = self.hyperparams['batch_size']
        num_samples = len(sample)
        num_actions = len(self.actions)

        train_xs = {'state': np.array([s for (s, _, _, _, _) in sample])}
        train_y = np.zeros((num_samples,))
        train_mask = np.zeros((num_samples, num_actions))
        minibatches = create_minibatch_iterator(
            {'state': np.array([n for (_, _, _, n, _) in sample])},
            None, None, batch_size)
        preds = np.zeros((num_samples, num_actions))
        # NOTE(review): rows are written back by batch position, which
        # assumes the iterator yields batches in input order - confirm.
        for batch_num, batch in enumerate(minibatches):
            batch_preds = self.q_model.compute_preds(
                batch,
                sess=self.sess)
            batch_start = batch_size * batch_num
            preds[batch_start:batch_start + batch_size] = batch_preds
        del minibatches

        target = exp_returns = None
        for experience_num, (_, action, reward, _, value) in \
                enumerate(sample):
            exp_returns = np.max(preds[experience_num])
            target = reward + value + reward_discount * exp_returns
            train_y[experience_num] = target
            train_mask[experience_num, np.argmax(action)] = 1
        # With an empty sample the loop body never runs, so there is no
        # "last" target to record.
        if num_samples:
            self.target = target
            self.exp_returns = exp_returns
        return train_xs, train_y, train_mask
Exemplo n.º 11
0
    def sample_preprocessor(self, sample):
        """Convert a sample of experiences into training arrays.

        Every element of ``sample`` is unpacked as
        ``(state, action, reward, next_state, value)``.  The next states
        are run through ``self.q_model.compute_preds`` in minibatches, and
        each experience's target is
        ``reward + value + reward_discount * max(prediction row)``.

        Args:
            sample: a sized, re-iterable collection of experience tuples.

        Returns:
            ``(train_xs, train_y, train_mask)``: ``train_xs`` is
            ``{'state': array of states}``, ``train_y`` is the vector of
            targets, and ``train_mask`` has a 1 at column
            ``np.argmax(action)`` of each experience's row.

        Side effects:
            Stores the last experience's target and max prediction on
            ``self.target`` / ``self.exp_returns``.  For an empty
            ``sample`` these attributes are not modified (the assignment
            used to fail with ``UnboundLocalError``).
        """
        reward_discount = self.hyperparams['reward_discount']
        batch_size = self.hyperparams['batch_size']
        num_samples = len(sample)
        num_actions = len(self.actions)

        train_xs = {'state': np.array([s for (s, _, _, _, _) in sample])}
        train_y = np.zeros((num_samples, ))
        train_mask = np.zeros((num_samples, num_actions))
        minibatches = create_minibatch_iterator(
            {'state': np.array([n for (_, _, _, n, _) in sample])}, None, None,
            batch_size)
        preds = np.zeros((num_samples, num_actions))
        # NOTE(review): rows are written back by batch position, which
        # assumes the iterator yields batches in input order - confirm.
        for batch_num, batch in enumerate(minibatches):
            batch_preds = self.q_model.compute_preds(batch, sess=self.sess)
            batch_start = batch_size * batch_num
            preds[batch_start:batch_start + batch_size] = batch_preds
        del minibatches

        target = exp_returns = None
        for experience_num, (_, action, reward, _, value) in \
                enumerate(sample):
            exp_returns = np.max(preds[experience_num])
            target = reward + value + reward_discount * exp_returns
            train_y[experience_num] = target
            train_mask[experience_num, np.argmax(action)] = 1
        # With an empty sample the loop body never runs, so there is no
        # "last" target to record.
        if num_samples:
            self.target = target
            self.exp_returns = exp_returns
        return train_xs, train_y, train_mask
Exemplo n.º 12
0
    def compute_val_stats(self, model, val_xs, val_y):
        """Return validation accuracy as ``{'acc': value}``, or ``None``.

        ``None`` is returned when ``self.accuracy`` is ``None``.
        Otherwise minibatches of ``val_xs`` / ``val_y`` are built with
        ``model.test_batch_preprocessor`` and the per-batch result of
        ``self.accuracy.compute`` is combined by ``avg_over_batches``.
        """
        if self.accuracy is None:
            return None

        def batch_accuracy(batch):
            # Score one minibatch against its own 'y' entry.
            return self.accuracy.compute(model, batch, self.y_var,
                                         batch['y'])

        val_batches = create_minibatch_iterator(
            val_xs,
            val_y,
            model.test_batch_preprocessor,
            batch_size=self.hyperparams['batch_size'])
        return {'acc': avg_over_batches(val_batches, batch_accuracy)}
Exemplo n.º 13
0
    def compute_val_stats(self, model, val_xs, val_y):
        """Compute validation statistics for ``model``.

        Returns ``None`` when ``self.accuracy`` is ``None``; otherwise a
        dict ``{'acc': value}`` where ``value`` is what
        ``avg_over_batches`` produces from ``self.accuracy.compute``
        applied to each validation minibatch.
        """
        if self.accuracy is None:
            return None

        def batch_accuracy(batch):
            # Score one minibatch against its own 'y' entry.
            return self.accuracy.compute(
                model, batch, self.y_var, batch['y'])

        val_batches = create_minibatch_iterator(
            val_xs, val_y, model.test_batch_preprocessor,
            batch_size=self.hyperparams['batch_size'])
        return {'acc': avg_over_batches(val_batches, batch_accuracy)}
Exemplo n.º 14
0
 def recreate_batches():
     """Build a fresh minibatch iterator over the training data.

     Uses ``train_xs``, ``train_y``, ``train_mask`` and ``self`` from the
     surrounding scope; none of them are parameters of this function.
     """
     batches = create_minibatch_iterator(
         train_xs,
         train_y,
         self.model.train_batch_preprocessor,
         batch_size=self.hyperparams['batch_size'],
         train_mask=train_mask)
     return batches