def train_net(self):
    batch_size = self.hyperparams['batch_size']
    sample = self.experience_replay
    minibatches = create_minibatch_iterator({'sample': sample}, None,
                                            self.batch_preprocessor,
                                            batch_size)
    num_updates = self.hyperparams['updates_per_iter']
    epoch = 0
    self.recent_loss = deque(
        maxlen=int(self.hyperparams['num_recent_steps'] /
                   self.hyperparams['num_threads']))
    self.recent_policy_loss = deque(
        maxlen=int(self.hyperparams['num_recent_steps'] /
                   self.hyperparams['num_threads']))
    self.recent_value_loss = deque(
        maxlen=int(self.hyperparams['num_recent_steps'] /
                   self.hyperparams['num_threads']))
    print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    for update_num in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'sample': sample}, None, self.batch_preprocessor, batch_size)
            batch = next(minibatches)
            epoch += 1
            print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        display = update_num % self.hyperparams['display_freq'] == 0
        if display:
            print('Update num: %d' % update_num)
        self.train_model(batch[0], batch[1], batch[2], batch[3], display)
    self.experience_replay.clear()
    self.update_learning_rate()
def train_net(self):
    batch_size = self.hyperparams['batch_size']
    # sample = self.experience_replay.sample(
    #     len(self.experience_replay))
    sample = self.experience_replay
    minibatches = create_minibatch_iterator({'sample': sample}, None,
                                            self.batch_preprocessor,
                                            batch_size)
    num_updates = self.hyperparams['updates_per_iter']
    epoch = 0
    print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    for update_num in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'sample': sample}, None, self.batch_preprocessor, batch_size)
            batch = next(minibatches)
            epoch += 1
            print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        display = update_num % self.hyperparams['display_freq'] == 0
        if display:
            print('Update num: %d' % update_num)
        self.train_model(batch[0], batch[1], batch[2], display)
    self.experience_replay.clear()
    self.update_learning_rate()
def train_net(self, init=False):
    batch_size = self.hyperparams['batch_size']
    sample = self.experience_replay.get_all()
    minibatches = create_minibatch_iterator({'sample': sample}, None,
                                            self.batch_preprocessor,
                                            batch_size)
    if init:
        num_updates = self.hyperparams['init_updates']
    else:
        num_updates = self.hyperparams['updates_per_iter']
    epoch = 0
    avg_recent_loss = deque(maxlen=int(
        self.hyperparams['num_recent_steps'] /
        self.hyperparams['display_freq']))
    print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    for update_num in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'sample': sample}, None, self.batch_preprocessor, batch_size)
            batch = next(minibatches)
            epoch += 1
            print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        display = update_num % self.hyperparams['display_freq'] == 0
        if display:
            print('Update num: %d' % update_num)
        self.train_model(batch[0], batch[1], batch[2], avg_recent_loss,
                         display)
    self.experience_replay.clear()
    self.update_learning_rate()
def train_net(self):
    batch_size = self.hyperparams['batch_size']
    # sample = self.experience_replay.sample(
    #     len(self.experience_replay))
    train_xs, train_y, train_mask = self.sample_preprocessor(
        self.experience_replay)
    minibatches = create_minibatch_iterator(
        {'state': train_xs['state'],
         'train_y': train_y,
         'train_mask': train_mask},
        None, None, batch_size)
    num_updates = self.hyperparams['updates_per_iter']
    epoch = 0
    print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    for update_num in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'state': train_xs['state'],
                 'train_y': train_y,
                 'train_mask': train_mask},
                None, None, batch_size)
            batch = next(minibatches)
            epoch += 1
            print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        display = update_num % self.hyperparams['display_freq'] == 0
        if display:
            print('Update num: %d' % update_num)
        self.train_model({'state': batch['state']}, batch['train_y'],
                         batch['train_mask'], display)
    del minibatches
    del train_xs['state'], train_xs, train_y, train_mask
def train_net(self):
    batch_size = self.hyperparams['batch_size']
    sample = self.experience_replay
    minibatches = create_minibatch_iterator({'sample': sample}, None,
                                            self.batch_preprocessor,
                                            batch_size)
    num_updates = self.hyperparams['updates_per_iter']
    epoch = 0
    print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    self.recent_loss = deque(maxlen=int(
        self.hyperparams['num_recent_steps'] /
        self.hyperparams['num_threads']))
    self.recent_value_loss = deque(maxlen=int(
        self.hyperparams['num_recent_steps'] /
        self.hyperparams['num_threads']))
    self.recent_policy_loss = deque(maxlen=int(
        self.hyperparams['num_recent_steps'] /
        self.hyperparams['num_threads']))
    for update_num in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'sample': sample}, None, self.batch_preprocessor, batch_size)
            batch = next(minibatches)
            epoch += 1
            print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        display = update_num % self.hyperparams['display_freq'] == 0
        if display:
            print('-' * 40 + ' Update num: %d ' % update_num + '-' * 40)
        self.train_model(batch[0], batch[1], batch[2], batch[3], display)
    self.experience_replay.clear()
    self.update_learning_rate()
def train_partial_model(self, num_updates, mega=False, display=False):
    batch_size = self.hyperparams['batch_size']
    rollout_len = self.hyperparams['value_rollout_length']
    if mega:
        sample = self.experience_replay.sample(
            len(self.experience_replay) - rollout_len - 1, rollout_len,
            decompress=False)
    else:
        sample = self.experience_replay.sample(
            batch_size, rollout_len, decompress=False)
    minibatches = create_minibatch_iterator({'sample': sample}, None,
                                            self.batch_preprocessor,
                                            batch_size)
    epoch = 0
    if mega:
        print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
    for update in range(num_updates):
        try:
            batch = next(minibatches)
        except StopIteration:
            minibatches = create_minibatch_iterator(
                {'sample': sample}, None, self.batch_preprocessor, batch_size)
            batch = next(minibatches)
            epoch += 1
            if mega:
                print('-' * 40 + 'Epoch %d' % epoch + '-' * 40)
        feed_dict = self.make_feed_dict(
            {'state': self.state_model,
             'reward': self.reward_model,
             'value': self.value_model},
            {'state': self.state_y,
             'reward': self.reward_y,
             'value': self.value_y},
            batch)
        self.sess.run(self.partial_train_step, feed_dict=feed_dict)
        if mega and update % self.hyperparams['update_target_freq'] == 0:
            self.update_value_target_weights()
        if display and update % self.hyperparams['display_freq'] == 0:
            if mega:
                print('Partial update %d' % update)
            self.state_loss_val = self.state_loss.loss.eval(
                feed_dict=feed_dict, session=self.sess)
            self.reward_loss_val = self.reward_loss.loss.eval(
                feed_dict=feed_dict, session=self.sess)
            self.value_loss_val = self.value_loss.loss.eval(
                feed_dict=feed_dict, session=self.sess)
            self.display_train_update(update)
def recreate_batches():
    return create_minibatch_iterator(
        train_xs, train_y, self.model.train_batch_preprocessor,
        batch_size=self.hyperparams['batch_size'], train_mask=train_mask)
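# All of the snippets above funnel their data through create_minibatch_iterator,
# which is defined elsewhere in this codebase. As a rough, hypothetical sketch of
# the behaviour the call sites appear to assume (consecutive dict-of-array
# batches, optional labels, optional mask, optional per-batch preprocessor, and
# StopIteration once the data is exhausted so callers can rebuild it), something
# like the generator below would fit; the real helper's signature and internals
# may differ.
def sketch_minibatch_iterator(xs, y, batch_preprocessor, batch_size,
                              train_mask=None):
    """Yield consecutive batches of size batch_size from a dict of arrays."""
    num_examples = len(next(iter(xs.values())))
    for start in range(0, num_examples, batch_size):
        end = start + batch_size
        batch = {key: val[start:end] for key, val in xs.items()}
        if y is not None:
            batch['y'] = y[start:end]
        if train_mask is not None:
            batch['mask'] = train_mask[start:end]
        if batch_preprocessor is not None:
            batch = batch_preprocessor(batch)
        yield batch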
def sample_preprocessor(self, sample):
    # batch = batch['sample']
    reward_discount = self.hyperparams['reward_discount']
    batch_size = self.hyperparams['batch_size']
    train_xs = {'state': np.array([s for (s, a, r, n, v) in sample])}
    train_y = np.zeros((len(sample),))
    train_mask = np.zeros((len(sample), len(self.actions)))
    # Predict Q-values for every next state, one minibatch at a time.
    minibatches = create_minibatch_iterator(
        {'state': np.array([n for (s, a, r, n, v) in sample])},
        None, None, batch_size)
    preds = np.zeros((len(sample), len(self.actions)))
    for batch_num, batch in enumerate(minibatches):
        batch_preds = self.q_model.compute_preds(batch, sess=self.sess)
        batch_start = batch_size * batch_num
        preds[batch_start:batch_start + batch_size] = batch_preds
    del minibatches
    for experience_num, (state, action, reward, next_state, value) in \
            enumerate(sample):
        # Q-learning style target: r + v + gamma * max_a Q(next_state, a).
        exp_returns = np.max(preds[experience_num])
        target = reward + value + reward_discount * exp_returns
        train_y[experience_num] = target
        train_mask[experience_num, np.argmax(action)] = 1
    del state, action, reward, next_state, value
    self.target = target
    self.exp_returns = exp_returns
    # self.mean_state_val = np.mean(np.array(
    #     [s for (s, a, r, n) in batch]))
    # self.mean_preds = np.mean(preds)
    return train_xs, train_y, train_mask
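# The per-transition target assembled in sample_preprocessor above is a
# Q-learning style bootstrap, target = r + v + gamma * max_a Q(next_state, a),
# where v is the extra value stored in each experience tuple. A tiny numeric
# check of that arithmetic with made-up numbers (purely illustrative):
import numpy as np

reward_discount = 0.99
reward, value = 1.0, 0.5
next_state_preds = np.array([0.2, 1.4, -0.3])  # hypothetical Q(next_state, .)
target = reward + value + reward_discount * np.max(next_state_preds)
assert abs(target - 2.886) < 1e-9  # 1.0 + 0.5 + 0.99 * 1.4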
def compute_val_stats(self, model, val_xs, val_y):
    if self.accuracy is not None:
        minibatches = create_minibatch_iterator(
            val_xs, val_y, model.test_batch_preprocessor,
            batch_size=self.hyperparams['batch_size'])

        def fn(batch):
            return self.accuracy.compute(model, batch, self.y_var, batch['y'])

        acc = avg_over_batches(minibatches, fn)
        return {'acc': acc}
    else:
        return None
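# compute_val_stats also leans on avg_over_batches from this codebase. The call
# site only assumes "apply fn to every batch from the iterator and average the
# results"; a minimal hypothetical stand-in with that behaviour could be:
def sketch_avg_over_batches(minibatches, fn):
    """Average fn(batch) over every batch produced by the iterator."""
    results = [fn(batch) for batch in minibatches]
    return sum(results) / len(results)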