class VFIB(keras.Model):
    """Variational Fair Information Bottleneck model.

    Wraps an encoder that produces (mu, log_sigma, z) and a classifier that
    predicts from the latent code concatenated with the sensitive feature.
    `loss_type` selects which regularizers join the prediction loss.
    """

    def __init__(self, encoder, predictor, feature_dim, loss_type, **kwargs):
        super(VFIB, self).__init__(**kwargs)
        self.encoder = encoder
        self.classifier = predictor
        # NOTE(review): `feature_dim` is accepted but never stored/used here —
        # kept for interface compatibility; confirm whether it should be saved.
        self.loss_type = loss_type  # 'all' (KL + MMD), 'kl', or prediction-only
        # Running means so Keras reports smoothed per-epoch loss components.
        self.total_loss_tracker = Mean(name="total_loss")
        self.prediction_loss_tracker = Mean(name="prediction_loss")
        self.kl_loss_tracker = Mean(name="kl_loss")
        self.mmd_loss_tracker = Mean(name="mmd_loss")

    @property
    def metrics(self):
        # Listing the trackers here lets Keras reset them between epochs.
        return [
            self.total_loss_tracker,
            self.prediction_loss_tracker,
            self.kl_loss_tracker,
            self.mmd_loss_tracker
        ]

    def call(self, inputs):
        """Encode inputs and classify from (latent code, sensitive feature)."""
        # 0 refers to first column with sensitive feature 'Age'
        sens, _ = split_sensitive_X(inputs, 0, 1)
        mu, sig, z = self.encoder(inputs)
        preds = self.classifier(tf.concat([z, sens], 1))
        return mu, sig, z, preds

    def train_step(self, data):
        """One optimization step; returns the tracked loss components."""
        X, y = data
        with tf.GradientTape() as tape:
            z_mean, z_log_sigma, z, preds = self.call(X)
            prediction_loss = neg_log_bernoulli(y, preds)
            kl_loss = KL(z_mean, z_log_sigma)
            # BUG FIX: this result was previously assigned to a local named
            # `mmd_loss`, shadowing the module-level `mmd_loss` function and
            # raising UnboundLocalError on the call itself.
            mmd_loss_value = mmd_loss(X, z)
            if self.loss_type == 'all':
                total_loss = prediction_loss + kl_loss + mmd_loss_value
            elif self.loss_type == 'kl':
                total_loss = prediction_loss + kl_loss
            else:
                total_loss = prediction_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.prediction_loss_tracker.update_state(prediction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        self.mmd_loss_tracker.update_state(mmd_loss_value)
        return {
            "loss": self.total_loss_tracker.result(),
            "classification_loss": self.prediction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
            "mmd_loss": self.mmd_loss_tracker.result()
        }
def train_wgain(dataset, gain, n_epoch, n_critic, alpha):
    '''Train wgain function

    Args:
      - dataset: A dataset TF2 object yielding (x_batch, mask_batch).
      - gain: a gain model (generator, discriminator as its two layers).
      - n_epoch: number of iterations.
      - n_critic: number of critic (discriminator) updates per generator update.
      - alpha: hyper-parameter weighting the reconstruction objective.

    Returns:
      - gain: Trained model
      - critic loss, generator loss and reconstruction loss for monitoring.
    '''
    generator, discriminator = gain.layers
    # WGAN-style setup: slow RMSprop for the critic, Adam for the generator.
    d_optimizer = keras.optimizers.RMSprop(lr=0.00005)
    g_optimizer = keras.optimizers.Adam()

    # Keep results for plotting
    train_d_loss_results = []
    train_g_loss_results = []
    train_rec_loss_results = []

    for epoch in range(n_epoch):
        epoch_d_loss_avg = Mean()
        epoch_g_loss_avg = Mean()
        epoch_rec_loss_avg = Mean()

        for x_batch, mask_batch in dataset:
            # phase 1: train the critic n_critic times per generator step
            for _ in range(n_critic):
                hint = hint_generator(x_batch, mask_batch)
                generated_samples = generator(hint, training=True)
                discriminator.trainable = True
                d_loss, d_grads = discriminator_grad(discriminator, generated_samples, mask_batch[:, 1:])
                d_optimizer.apply_gradients(zip(d_grads, discriminator.trainable_variables))

            # phase 2 - training the generator (adversarial loss)
            hint = hint_generator(x_batch, mask_batch)
            discriminator.trainable = False
            g_loss, g_grads = gain_grad(gain, hint)
            # BUG FIX: these generator gradients were previously applied with
            # `d_optimizer` (the critic's RMSprop); the generator must use its
            # own Adam optimizer, consistent with the reconstruction step below.
            g_optimizer.apply_gradients(zip(g_grads, gain.trainable_variables))

            # phase 3 - reconstruction objective, weighted by alpha
            hint = hint_generator(x_batch, mask_batch)
            rec_loss, rec_grads = rec_grad(generator, hint, mask_batch, alpha)
            g_optimizer.apply_gradients(zip(rec_grads, gain.trainable_variables))

            # Track progress: Add current batch loss
            epoch_d_loss_avg.update_state(d_loss)
            epoch_g_loss_avg.update_state(g_loss)
            epoch_rec_loss_avg.update_state(rec_loss)

        # End epoch
        train_d_loss_results.append(epoch_d_loss_avg.result())
        train_g_loss_results.append(epoch_g_loss_avg.result())
        train_rec_loss_results.append(epoch_rec_loss_avg.result())

    return gain, train_d_loss_results, train_g_loss_results, train_rec_loss_results
def train_rgan(gan_model, dataset, n_epochs):
    """Train a recurrent GAN on masked sequence data.

    Args:
        gan_model: stacked model whose two layers are the recurrent generator
            and recurrent discriminator.
        dataset: yields (x_batch, mask_batch); x_batch unpacks to
            (no, seq_len, dim), so sequences are rank-3.
        n_epochs: number of passes over `dataset`.

    Returns:
        (gan_model, discriminator-loss history, generator-loss history),
        one averaged loss per epoch.
    """
    generator_optimizer = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, clipnorm=1.)
    discriminator_optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True, clipnorm=1.)
    recurrent_generator, recurrent_discriminator = gan_model.layers

    # Keep results for plotting
    train_discriminator_loss_results = []
    train_generator_loss_results = []

    for epoch in range(n_epochs):
        epoch_discriminator_loss_avg = Mean()
        epoch_generator_loss_avg = Mean()

        for x_batch, mask_batch in dataset:
            no, seq_len, dim = x_batch.shape
            x_batch = cast(x_batch, float32)

            # phase 1 - training the discriminator:
            # fake and real sequences are concatenated along the time axis
            # (axis=1), labeled 0 for fake steps and 1 for real steps.
            noise = noise_generator(no, seq_len, dim)
            generated_samples = recurrent_generator(noise)
            x_fake_and_real = concat([generated_samples, x_batch], axis=1)
            y1 = cast(reshape(constant([[0.]] * seq_len + [[1.]] * seq_len), [seq_len*2, 1]), float32)
            y1 = tf.broadcast_to(y1, [no, seq_len*2, 1])
            # Generated half is always fully "observed": all-ones mask.
            mask1 = tf.ones([no, seq_len])
            mask_fake_and_real = concat([mask1, mask_batch], axis=1)
            recurrent_discriminator.trainable = True
            discriminator_loss_value, discriminator_grads = grad(recurrent_discriminator, x_fake_and_real, y1, mask_fake_and_real)
            discriminator_optimizer.apply_gradients(zip(discriminator_grads, recurrent_discriminator.trainable_variables))

            # phase 2 - training the generator:
            # fresh noise, target label 1 for every step (fool the critic),
            # discriminator frozen so only generator weights move.
            noise = noise_generator(no, seq_len, dim)
            y2 = cast(reshape(constant([[1.]] * seq_len), [seq_len, 1]), float32)
            y2 = tf.broadcast_to(y2, [no, seq_len, 1])
            recurrent_discriminator.trainable = False
            generator_loss_value, generator_grads = grad(gan_model, noise, y2, mask1)
            generator_optimizer.apply_gradients(zip(generator_grads, gan_model.trainable_variables))

            # Track progress: Add current batch loss
            epoch_discriminator_loss_avg.update_state(discriminator_loss_value)
            epoch_generator_loss_avg.update_state(generator_loss_value)

        # End epoch
        train_discriminator_loss_results.append(epoch_discriminator_loss_avg.result())
        train_generator_loss_results.append(epoch_generator_loss_avg.result())

        if epoch % 50 == 0:
            print("RGAN Epoch {:03d}: Discriminator Loss: {:.3f}".format(epoch, epoch_discriminator_loss_avg.result()), file=sys.stdout)
            print("RGAN Epoch {:03d}: Generator Loss: {:.3f}".format(epoch, epoch_generator_loss_avg.result()), file=sys.stdout)

    return gan_model, train_discriminator_loss_results, train_generator_loss_results
def train_gan(self, train_ds, epochs, print_every, save_every, log_filename, model_save_name):
    """Run the GAN training loop over `epochs` batches drawn from `train_ds`.

    Args:
        train_ds: dataset yielding (lr, hr) batches; one batch counts as one
            "epoch" here (the loop is `train_ds.take(epochs)`).
        epochs: number of batches to train on.
        print_every: console/log reporting interval, in epochs.
        save_every: model snapshot interval, in epochs.
        log_filename: basename (without extension) of the log file in LOG_DIR.
        model_save_name: tag embedded in the saved .h5 filenames.
    """
    # Running means of generator and discriminator losses; reset after each
    # report so every log line covers one reporting window.
    pls_metric = Mean()
    dls_metric = Mean()
    # Open in 'w+' and immediately close: creates/truncates the log file.
    log_file = open(os.path.join(LOG_DIR, '{}.txt'.format(log_filename)), 'w+')
    log_file.close()
    print('----- Start training -----')
    epoch = 0
    for lr, hr in train_ds.take(epochs):
        epoch += 1
        step_time = time.time()
        generator_loss, discriminator_loss = self.train_step(lr, hr)
        # Apply metrics
        pls_metric(generator_loss)
        dls_metric(discriminator_loss)
        # Update log every `print_every` epochs (and always on the first).
        if epoch == 1 or epoch % print_every == 0:
            print(
                'Epoch {}/{}, time: {:.3f}s, generator loss = {:.4f}, discriminator loss = {:.4f}'
                .format(epoch, epochs, time.time() - step_time, pls_metric.result(), dls_metric.result()))
            log_file = open(
                os.path.join(LOG_DIR, '{}.txt'.format(log_filename)), 'a')
            log_file.write(
                'Epoch {}/{}, time: {:.3f}s, generator loss = {:.4f}, discriminator loss = {:.4f}\n'
                .format(epoch, epochs, time.time() - step_time, pls_metric.result(), dls_metric.result()))
            log_file.close()
            pls_metric.reset_states()
            dls_metric.reset_states()
        # Save model every `save_every` epochs.
        # NOTE(review): `generator`, `discriminator` and `model_save_dir` are
        # module-level names, not `self` attributes — confirm those globals
        # exist, or whether `self.generator`/`self.discriminator` was intended.
        if epoch % save_every == 0:
            generator.save(model_save_dir + '/gen_{}_{}.h5'.format(model_save_name, epoch))
            discriminator.save(
                model_save_dir + '/dis_{}_{}.h5'.format(model_save_name, epoch))
def train(model, train_dataset, test_dataset, epochs, optimizer):
    """Train a generative model and evaluate ELBO/SSIM after every epoch.

    Args:
        model: model exposing `generate_images(model, dataset)`.
        train_dataset: iterable of training batches fed to `train_step`.
        test_dataset: iterable of test batches used for ELBO and SSIM.
        epochs: number of epochs; note the loop runs `epochs + 1` iterations
            (epoch 0 through `epochs`), preserved from the original behavior.
        optimizer: optimizer passed through to `train_step`.

    Returns:
        (model, elbos, ssims) — the trained model plus per-epoch histories.
    """
    # statistics to store
    elbos = []
    ssims = []
    print('Starting training...')
    # iterate over all epochs
    for epoch in range(epochs + 1):
        # iterate over train_dataset containing training images
        for x_train in train_dataset:
            train_step(model, x_train, optimizer)
        # feed the network test samples to generate new images
        predictions = model.generate_images(model, test_dataset)
        # display the results; best-effort only (e.g. headless environments).
        # FIX: narrowed the bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) to `except Exception`.
        try:
            display_result(predictions)
        except Exception:
            pass
        # Average the loss over the whole test set; ELBO is its negation.
        loss = Mean()
        for test_x in test_dataset:
            loss(calculate_loss(model, test_x))
        elbo = -loss.result()
        # evaluate the model using Structural Similarity between generated
        # images and test samples, alongside the ELBO
        ssim = calculate_ssim(predictions, test_dataset)
        print("> " + str(epoch) + ": SSIM=" + str(ssim) + ', ELBO=' + str(elbo))
        # add the evaluations to lists so results can be plotted later
        ssims.append(ssim)
        elbos.append(elbo)
    # return the trained model
    return model, elbos, ssims
def train(self, train_dataset, valid_dataset, steps, evaluate_every=1000, save_best_only=False):
    """Resume training from the checkpointed step up to `steps` total.

    Every `evaluate_every` steps the smoothed loss and validation PSNR are
    reported; the checkpoint is saved unless `save_best_only` is set and the
    PSNR did not improve.
    """
    running_loss = Mean()
    manager = self.checkpoint_manager
    ckpt = self.checkpoint
    self.now = time.perf_counter()

    remaining = steps - ckpt.step.numpy()
    for lr, hr in train_dataset.take(remaining):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        running_loss(self.train_step(lr, hr))
        print("Currently in the train step ", step)

        if step % evaluate_every != 0:
            continue

        loss_value = running_loss.result()
        running_loss.reset_states()

        # Compute PSNR on validation dataset
        psnr_value = self.evaluate(valid_dataset)
        duration = time.perf_counter() - self.now
        print(f'{step}/{steps}: loss = {loss_value.numpy():.3f}, PSNR = {psnr_value.numpy():3f} ({duration:.2f}s)')

        # Save unless best-only mode is on and PSNR failed to improve.
        if (not save_best_only) or psnr_value > ckpt.psnr:
            ckpt.psnr = psnr_value
            manager.save()
        self.now = time.perf_counter()
def pre_train(generator, train_dataset, valid_dataset, steps, evaluate_every=1, lr_rate=1e-4):
    """Pre-train the generator alone with a pixel-wise MSE objective.

    Runs `steps` batches from `train_dataset`, reporting smoothed loss and
    validation PSNR every `evaluate_every` steps.
    """
    running_loss = Mean()
    mse = MeanSquaredError()
    optimizer = Adam(lr_rate)
    now = time.perf_counter()

    for step, (lr, hr) in enumerate(train_dataset.take(steps), start=1):
        with tf.GradientTape() as tape:
            lr = tf.cast(lr, tf.float32)
            hr = tf.cast(hr, tf.float32)
            sr = generator(lr, training=True)
            loss_value = mse(hr, sr)

        grads = tape.gradient(loss_value, generator.trainable_variables)
        optimizer.apply_gradients(zip(grads, generator.trainable_variables))
        running_loss(loss_value)

        if step % evaluate_every == 0:
            loss_value = running_loss.result()
            running_loss.reset_states()
            psnr_value = evaluate(generator, valid_dataset)
            duration = time.perf_counter() - now
            print(
                f'{step}/{steps}: loss = {loss_value.numpy():.3f}, PSNR = {psnr_value.numpy():3f} ({duration:.2f}s)')
            now = time.perf_counter()
def train(self, data_generator, epochs=10, checkpoint_dir='./training_checkpoints'):
    """Run the adversarial training loop.

    Args:
        data_generator: iterable yielding batches of real images.
        epochs: number of full passes over `data_generator`.
        checkpoint_dir: directory receiving periodic training checkpoints.

    Saves a checkpoint every 5 epochs and plots images generated from a fixed
    noise vector after each epoch.
    """
    # create checkpoint to save the training progression
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(
        generator_optimizer=self.optimizer_g,
        discriminator_optimizer=self.optimizer_d,
        generator=self.generator,
        discriminator=self.discriminator)

    # Fixed latent points: the same noise every epoch makes the generated
    # previews comparable across training.
    fixed_noise = get_noise(25)

    # loop over epochs
    # (FIX: removed dead commented-out preview code that referenced undefined
    # names `generator` and `jupy_display`.)
    for epoch in range(epochs):
        start = time.time()
        print("====== Epoch {:2d} ======".format(epoch))

        # initiate the mean loss over the epoch for discriminator and generator
        epoch_loss_d = Mean()
        epoch_loss_g = Mean()

        # FIX: dropped the unused `enumerate` index.
        for real_images in data_generator:
            loss_d, loss_g = self.train_step(real_images)
            epoch_loss_d(loss_d)
            epoch_loss_g(loss_g)

        print("\nDiscriminator: {}, Generator: {}".format(
            epoch_loss_d.result(), epoch_loss_g.result()))
        print('Time for epoch {} is {} sec'.format(epoch + 1, time.time() - start))

        if (epoch + 1) % 5 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        fake_images = self.generator(fixed_noise, training=False)
        self.plot_generated_images(fake_images, epoch, save=True)
def train(self, train_dataset, valid_dataset, save_best_only=False):
    """Train until `self.args.num_iter` steps, logging/validating per args.

    Per-step loss (the Mean is read and reset every iteration, so it equals
    the current batch loss) and learning rate go to TensorBoard every
    `log_freq` steps; PSNR validation runs every `valid_freq` steps; weights
    and a full checkpoint are written every `save_freq` steps.
    """
    batch_loss = Mean()
    manager = self.checkpoint_manager
    ckpt = self.checkpoint
    self.now = time.perf_counter()

    remaining = self.args.num_iter - ckpt.step.numpy()
    for lr, hr in train_dataset.take(remaining):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        batch_loss(self.train_step(lr, hr))
        loss_value = batch_loss.result()
        batch_loss.reset_states()

        lr_value = ckpt.optimizer._decayed_lr('float32').numpy()
        duration = time.perf_counter() - self.now
        self.now = time.perf_counter()

        if step % self.args.log_freq == 0:
            tf.summary.scalar('loss', loss_value, step=step)
            tf.summary.scalar('lr', lr_value, step=step)

        if step % self.args.print_freq == 0:
            print(
                f'{step}/{self.args.num_iter}: loss = {loss_value.numpy():.3f} , lr = {lr_value:.6f} ({duration:.2f}s)'
            )

        if step % self.args.valid_freq == 0:
            psnr_value = self.evaluate(valid_dataset)
            ckpt.psnr = psnr_value
            tf.summary.scalar('psnr', psnr_value, step=step)
            print(
                f'{step}/{self.args.num_iter}: loss = {loss_value.numpy():.3f}, lr = {lr_value:.6f}, PSNR = {psnr_value.numpy():3f}'
            )

        if step % self.args.save_freq == 0:
            # save weights only
            self.checkpoint.model.save_weights(
                filepath=f'{self.ckpt_path}/weights-{step}.h5', save_format='h5')
            # save ckpt (weights + other train status)
            manager.save(checkpoint_number=step)
class StandardVarianceBasedMetric(Metric):
    """Abstract Keras metric reporting the standard deviation of an objective.

    Maintains running means of x and x^2 and reports
    sqrt(E[x^2] - E[x]^2). Subclasses supply `_objective_function`.
    """

    def __init__(self, name, dtype):
        super().__init__(name, dtype=dtype)
        self._mean = Mean(dtype=dtype)
        self._square_mean = Mean(dtype=dtype)

    @abstractmethod
    def _objective_function(self, y_true, y_pred):
        """Return per-sample objective values for (y_true, y_pred)."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate the objective and its square into the running means."""
        values = self._objective_function(y_true, y_pred)
        self._mean.update_state(values=values, sample_weight=sample_weight)
        self._square_mean.update_state(values=tf.square(values), sample_weight=sample_weight)

    def result(self):
        """Return the running standard deviation."""
        # FIX: E[x^2] - E[x]^2 can come out slightly negative due to floating
        # point rounding; clamp at zero so tf.sqrt never returns NaN.
        variance = self._square_mean.result() - tf.square(self._mean.result())
        return tf.sqrt(tf.maximum(variance, 0.))

    def reset_states(self):
        """Clear both running means."""
        self._mean.reset_states()
        self._square_mean.reset_states()
def gain_train_step(dataset, gain, n_epochs):
    """Train a GAIN imputation model for `n_epochs` passes over `dataset`.

    Returns the trained `gain` model together with per-epoch averaged
    discriminator and generator losses for monitoring.
    """
    generator, discriminator = gain.layers
    d_opt = keras.optimizers.SGD(momentum=0.9, nesterov=True)
    g_opt = keras.optimizers.Adam()

    # Loss histories for plotting.
    d_loss_history = []
    g_loss_history = []

    for epoch in range(n_epochs):
        d_loss_avg = Mean()
        g_loss_avg = Mean()

        for x_batch, mask_batch in dataset:
            x_batch = cast(x_batch, float32)
            mask_batch = cast(mask_batch, float32)

            # Phase 1: update the discriminator on generator-imputed samples.
            hint = hint_generator(x_batch, mask_batch)
            generated = generator(concat([hint, mask_batch], axis=1))
            discriminator.trainable = True
            d_loss, d_grads = gain_grad(discriminator, generated, mask_batch)
            d_opt.apply_gradients(zip(d_grads, discriminator.trainable_variables))

            # Phase 2: update the generator with the discriminator frozen.
            hint = hint_generator(x_batch, mask_batch)
            discriminator.trainable = False
            g_loss, g_grads = gain_grad(gain, concat([hint, mask_batch], axis=1), mask_batch)
            g_opt.apply_gradients(zip(g_grads, gain.trainable_variables))

            # Accumulate batch losses into the epoch averages.
            d_loss_avg.update_state(d_loss)
            g_loss_avg.update_state(g_loss)

        # Epoch finished: record the averaged losses.
        d_loss_history.append(d_loss_avg.result())
        g_loss_history.append(g_loss_avg.result())

        if epoch % 50 == 0:
            print("GAIN Epoch {:03d}: Discriminator Loss: {:.3f}".format(epoch, d_loss_avg.result()), file=sys.stdout)
            print("GAIN Epoch {:03d}: Generator Loss: {:.3f}".format(epoch, g_loss_avg.result()), file=sys.stdout)

    return gain, d_loss_history, g_loss_history
class Leaner:
    """MuZero learner: samples batches from a remote (Ray) replay buffer,
    optimizes the network, and periodically publishes weights to shared
    storage.

    NOTE(review): class name looks like a typo for "Learner" but is kept —
    renaming would break external references.
    """

    def __init__(self, config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer):
        self.config = config
        self.storage = storage
        self.replay_buffer = replay_buffer
        self.summary = create_summary(name="leaner")
        # Running mean of the training loss between summary flushes.
        self.metrics_loss = Mean('leaner-loss', dtype=tf.float32)
        self.network = Network(self.config)
        # Exponentially decaying learning rate as prescribed by the config.
        self.lr_schedule = ExponentialDecay(
            initial_learning_rate=self.config.lr_init,
            decay_steps=self.config.lr_decay_steps,
            decay_rate=self.config.lr_decay_rate)
        self.optimizer = Adam(learning_rate=self.lr_schedule)

    def start(self):
        """Train until the configured step budget, checkpointing on the way."""
        while self.network.training_steps() < self.config.training_steps:
            # Only train once the remote replay buffer holds data.
            if ray.get(self.replay_buffer.size.remote()) > 0:
                self.train()
                done = self.network.training_steps()
                if done % self.config.checkpoint_interval == 0:
                    self.storage.update_network.remote(self.network.get_weights())
                if done % self.config.save_interval == 0:
                    self.network.save()
        print("Finished")

    def train(self):
        """One optimization step on a sampled batch, with loss logging."""
        batch = ray.get(self.replay_buffer.sample_batch.remote())
        with tf.GradientTape() as tape:
            loss = self.network.loss_function(batch)
        variables = self.network.get_variables()
        self.optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
        self.metrics_loss(loss)
        with self.summary.as_default():
            tf.summary.scalar('loss', self.metrics_loss.result(), self.network.training_steps())
        self.metrics_loss.reset_states()
        self.network.update_training_steps()
def train(self, train_dataset, valid_dataset, steps, evaluate_every=1000, save_best_only=False):
    """Resume training from the checkpointed step up to `steps` total,
    printing per-batch timing and validating PSNR every `evaluate_every`
    steps. Checkpoints are kept only on PSNR improvement when
    `save_best_only` is set.
    """
    running_loss = Mean()
    manager = self.checkpoint_manager
    ckpt = self.checkpoint
    self.now = time.perf_counter()

    # (lr, hr) = low-resolution / high-resolution image pair
    for lr, hr in train_dataset.take(steps - ckpt.step.numpy()):
        batch_start = time.time()
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        loss = self.train_step(lr, hr)
        running_loss(loss)
        batch_end = time.time()

        # NOTE(review): this log hard-codes 50 steps per epoch — confirm
        # against the actual dataset size.
        print("epoch:%3d step:%2d loss:%.5f time:%.3f" %
              (step / 50, step % 50, loss, batch_end - batch_start))

        # evaluate
        if step % evaluate_every != 0:
            continue

        loss_value = running_loss.result()
        running_loss.reset_states()

        # Compute PSNR on validation dataset
        psnr_value = self.evaluate(valid_dataset)
        duration = time.perf_counter() - self.now
        print(
            f'{step}/{steps}: loss = {loss_value.numpy():.3f}, PSNR = {psnr_value.numpy():3f} ({duration:.2f}s)'
        )

        # Save unless best-only mode is on and PSNR failed to improve.
        if (not save_best_only) or psnr_value > ckpt.psnr:
            ckpt.psnr = psnr_value
            manager.save()
            print("checkpoint saved!")
        self.now = time.perf_counter()
def train(self, train_dataset, valid_dataset, steps, evaluate_every=1000, save_best_only=False):
    """Resume training up to `steps` total, validating every `evaluate_every`
    steps and dumping the (step, loss, PSNR) history to ./visLoss.csv for
    later plotting.

    When `save_best_only` is set, checkpoints are saved only on PSNR
    improvement over the value stored in the checkpoint.
    """
    loss_mean = Mean()
    ckpt_mgr = self.checkpoint_manager
    ckpt = self.checkpoint

    vis_list = []
    for lr, hr in train_dataset.take(steps - ckpt.step.numpy()):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        loss = self.train_step(lr, hr)
        loss_mean(loss)

        if step % evaluate_every == 0:
            loss_value = loss_mean.result()
            loss_mean.reset_states()

            # Compute PSNR on validation dataset
            psnr_value = self.evaluate(valid_dataset)
            print(
                f'{step}/{steps}: loss = {loss_value.numpy():.3f}, PSNR = {psnr_value.numpy():3f}'
            )
            vis_list.append((step, loss_value, psnr_value))

            if save_best_only and psnr_value <= ckpt.psnr:
                # skip saving checkpoint, no PSNR improvement
                continue

            ckpt.psnr = psnr_value
            ckpt_mgr.save()

    # saving progress data to make graphs
    # FIX: use a context manager so the handle closes even if a write raises,
    # and stop shadowing the stdlib `csv` module name.
    with open('./visLoss.csv', 'w') as csv_file:
        csv_file.write('step, loss, psnr\n')
        for vals in vis_list:
            csv_file.write('{},{},{}\n'.format(vals[0], vals[1], vals[2]))
def train(self, train_dataset, valid_dataset, steps, evaluate_every=1000, save_best_only=False):
    """Resume training from the checkpointed step up to `steps` total.

    Every `evaluate_every` steps: report smoothed loss, validate PSNR,
    render a demo image, and checkpoint (only on improvement when
    `save_best_only` is set).
    """
    running_loss = Mean()
    manager = self.checkpoint_manager
    ckpt = self.checkpoint
    self.now = time.perf_counter()

    pending = steps - ckpt.step.numpy()
    for lr, hr in train_dataset.take(pending):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        running_loss(self.train_step(lr, hr))

        if step % evaluate_every != 0:
            continue

        loss_value = running_loss.result()
        running_loss.reset_states()

        # Compute PSNR on validation dataset
        psnr_value = self.evaluate(valid_dataset)
        duration = time.perf_counter() - self.now
        print(
            f'{step}/{steps}: loss = {loss_value.numpy():.3f}, PSNR = {psnr_value.numpy():3f} ({duration:.2f}s)'
        )

        # Render the demo image for visual progress tracking.
        self.resolve_and_plot('demo/img_0', step)

        # Save unless best-only mode is on and PSNR failed to improve.
        if (not save_best_only) or psnr_value > ckpt.psnr:
            ckpt.psnr = psnr_value
            manager.save()
        self.now = time.perf_counter()
class MeanBasedMetric(Metric):
    """Abstract Keras metric that reports the running mean of an objective.

    Subclasses implement `_objective_function` to produce per-sample values;
    this class handles accumulation, reporting and resetting.
    """

    def __init__(self, name, dtype):
        super().__init__(name, dtype=dtype)
        self._mean = Mean(dtype=dtype)

    @abstractmethod
    def _objective_function(self, y_true, y_pred):
        """Return per-sample objective values for (y_true, y_pred)."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold the objective for this batch into the running mean."""
        objective = self._objective_function(y_true, y_pred)
        self._mean.update_state(objective, sample_weight=sample_weight)

    def result(self):
        """Current running mean of the objective."""
        return self._mean.result()

    def reset_states(self):
        """Clear the running mean."""
        self._mean.reset_states()
def train_generator(self, train_dataset, valid_dataset, epochs=20000, valid_lr=None, valid_hr=None):
    """Train the generator, logging every 50 epochs and checkpointing ~10
    times over the run.

    Args:
        train_dataset: yields (lr, hr) batches; one batch == one "epoch".
        valid_dataset: validation set; one batch is used for PSNR evaluation.
        epochs: number of batches to train on.
        valid_lr, valid_hr: optional fixed pair for the periodic image dump.
    """
    # FIX: was `epochs / 10` (a float), making the modulo test below a
    # fragile float comparison; integer division keeps it exact, and
    # max(1, ...) avoids a ZeroDivisionError when epochs < 10.
    evaluate_size = max(1, epochs // 10)
    loss_mean = Mean()

    start_time = time.time()
    epoch = 0
    for lr, hr in train_dataset.take(epochs):
        epoch += 1
        step = tf.convert_to_tensor(epoch, dtype=tf.int64)

        generator_loss = self.train_generator_step(lr, hr)
        loss_mean(generator_loss)

        if epoch % 50 == 0:
            loss_value = loss_mean.result()
            loss_mean.reset_states()

            psnr_value = self.evaluate(valid_dataset.take(1))
            print(
                f'Time for epoch {epoch}/{epochs} is {(time.time() - start_time):.4f} sec, '
                f'gan loss = {loss_value:.4f}, psnr = {psnr_value:.4f}')
            start_time = time.time()

            if self.summary_writer is not None:
                with self.summary_writer.as_default():
                    tf.summary.scalar('generator_loss', loss_value, step=epoch)
                    tf.summary.scalar('psnr', psnr_value, step=epoch)

        # Periodic checkpoint (~10 over the whole run).
        if epoch % evaluate_size == 0:
            self.util.save_checkpoint(self.checkpoint, epoch)

        # Periodic qualitative image dump.
        if epoch % 5000 == 0:
            self.generate_and_save_images(step, valid_lr, valid_hr)
def train(self, train_dataset, valid_dataset, steps, evaluate_every=1000, save_best_only=False):
    """Resume training up to `steps` total, validating PSNR/SSIM every
    `evaluate_every` steps; checkpoints are kept only on PSNR improvement
    when `save_best_only` is set.
    """
    loss_mean = Mean()
    ckpt_mgr = self.checkpoint_manager
    ckpt = self.checkpoint

    self.now = timeit.default_timer()

    for lr, hr in train_dataset.take(steps - ckpt.step.numpy()):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        loss = self.train_step(lr, hr)
        loss_mean(loss)

        if step % evaluate_every == 0:
            loss_value = loss_mean.result()
            loss_mean.reset_states()

            # Compute PSNR on validation dataset
            psnr_value, ssim_value = self.evaluate(valid_dataset)
            duration = timeit.default_timer() - self.now
            print('%d/%d: loss = %.3f, PSNR = %3f (%.2fs)' %
                  (step, steps, loss_value.numpy(), psnr_value.numpy(), duration))

            # BUG FIX: these two resets previously called `timeit.timeit()`,
            # which *benchmarks an empty statement* and returns that elapsed
            # time (a tiny constant), not a clock reading — so every later
            # `duration` was garbage. `timeit.default_timer()` is the clock.
            if save_best_only and psnr_value <= ckpt.psnr:
                self.now = timeit.default_timer()
                # skip saving checkpoint, no PSNR improvement
                continue

            ckpt.psnr = psnr_value
            ckpt_mgr.save()
            self.now = timeit.default_timer()
def train(self, train_ds, valid_ds, steps, evaluate_every=1000, save_best_only=False):
    """Resume training from the checkpointed step up to `steps` total.

    Every `evaluate_every` steps the smoothed loss and validation PSNR are
    reported; a checkpoint is written unless `save_best_only` is set and the
    PSNR did not improve.
    """
    running_loss = Mean()
    manager = self.checkpoint_manager
    ckpt = self.checkpoint
    self.now = time.perf_counter()

    for lr, hr in train_ds.take(steps - ckpt.step.numpy()):
        ckpt.step.assign_add(1)
        step = ckpt.step.numpy()

        running_loss(self.train_step(lr, hr))

        if step % evaluate_every != 0:
            continue

        # Record loss value
        loss_value = running_loss.result()
        running_loss.reset_states()

        # Compute PSNR on validation set
        psnr_value = self.evaluate(valid_ds)

        # Calculate time consumed
        duration = time.perf_counter() - self.now
        print('{}/{}: loss = {:.3f}, PSNR = {:.3f} ({:.2f}s)'.format(
            step, steps, loss_value.numpy(), psnr_value.numpy(), duration))

        # Save checkpoint unless best-only mode is on and PSNR didn't improve.
        if (not save_best_only) or psnr_value > ckpt.psnr:
            ckpt.psnr = psnr_value
            manager.save()
        self.now = time.perf_counter()
class ModelTrainer:
    """Training driver with checkpointing and separate train/test TensorBoard logs.

    Note: Having this model keeps the trainStep and testStep instance new
    every time you call it. Implementing those functions outside a class
    will return an error
    ValueError: Creating variables on a non-first call to a function
    decorated with tf.function.
    """

    def __init__(self, model, loss, metric, optimizer, ckptDir, logDir, multiGPU=True, evalStep=1000):
        # Safety checks: ensure checkpoint and per-split log directories exist.
        self.logDirTrain = os.path.join(logDir, 'Train')
        self.logDirTest = os.path.join(logDir, 'Test')
        if not os.path.exists(ckptDir):
            os.makedirs(ckptDir)
        if not os.path.exists(self.logDirTrain):
            os.makedirs(self.logDirTrain)
        if not os.path.exists(self.logDirTest):
            os.makedirs(self.logDirTest)
        # Separate writers so train and test curves land in distinct runs.
        self.trainWriter = tf.summary.create_file_writer(self.logDirTrain)
        self.testWriter = tf.summary.create_file_writer(self.logDirTest)
        # Checkpoint tracks global step, best PSNR, optimizer and model state.
        self.ckpt = tf.train.Checkpoint(step=tf.Variable(0),
                                        psnr=tf.Variable(1.0),
                                        optimizer=optimizer,
                                        model=model)
        self.ckptMngr = tf.train.CheckpointManager(checkpoint=self.ckpt,
                                                   directory=ckptDir,
                                                   max_to_keep=5)
        self.loss = loss      # callable: loss(patchHR, maskHR, predPatchHR)
        self.metric = metric  # callable: metric(patchHR, maskHR, predPatchHR)
        # Running means accumulated across steps, reset at epoch boundaries.
        self.accTestLoss = Mean(name='accTestLoss')
        self.accTestPSNR = Mean(name='accTestPSNR')
        self.accTrainLoss = Mean(name='accTrainLoss')
        self.accTrainPSNR = Mean(name='accTrainPSNR')
        self.evalStep = evalStep
        self.multiGPU = multiGPU
        # NOTE(review): strategy stays None and multiGPU is never consulted in
        # this class — confirm whether distributed training was intended here.
        self.strategy = None
        self.restore()

    @property
    def model(self):
        # The live model instance is owned by the checkpoint object.
        return self.ckpt.model

    def restore(self):
        """Resume from the latest checkpoint if one exists."""
        if self.ckptMngr.latest_checkpoint:
            self.ckpt.restore(self.ckptMngr.latest_checkpoint)
            print(
                f'[ INFO ] Model restored from checkpoint at step {self.ckpt.step.numpy()}.'
            )

    def fitTrainData(self,
                     X: tf.Tensor, y: tf.Tensor,
                     globalBatchSize: int, epochs: int,
                     valData: List[np.ma.array],
                     bufferSize: int = 128,
                     valSteps: int = 64,
                     saveBestOnly: bool = True,
                     initEpoch: int = 0):
        """Train on (X, y), validating every `evalStep` steps.

        `y` unpacks as (targets, target masks); `valData` as
        (inputs, targets, masks). Checkpoints are saved only when validation
        PSNR improves (when `saveBestOnly`).
        """
        logger.info('[ INFO ] Loading data set to buffer cache...')
        trainSet = loadTrainDataAsTFDataSet(X, y[0], y[1], epochs, globalBatchSize, bufferSize)
        valSet = loadValDataAsTFDataSet(valData[0], valData[1], valData[2], valSteps, globalBatchSize, bufferSize)
        logger.info('[ INFO ] Loading success...')

        dataSetLength = len(X)
        # Steps per epoch; trainSet already spans all epochs, so one flat loop.
        totalSteps = tf.cast(dataSetLength / globalBatchSize, tf.int64)
        globalStep = tf.cast(self.ckpt.step, tf.int64)
        step = globalStep % totalSteps
        epoch = initEpoch

        logger.info('[ INFO ] Begin training...')
        for x_batch_train, y_batch_train, y_mask_batch_train in trainSet:
            # Epoch boundary: advance the counter and reset running metrics.
            if (totalSteps - step) == 0:
                epoch += 1
                step = tf.cast(self.ckpt.step, tf.int64) % totalSteps
                logger.info(
                    f'[ *************** NEW EPOCH *************** ] Epoch number {epoch}'
                )
                # Reset metrics
                self.accTrainLoss.reset_states()
                self.accTrainPSNR.reset_states()
                self.accTestLoss.reset_states()
                self.accTestPSNR.reset_states()

            step += 1
            globalStep += 1
            self.trainStep(x_batch_train, y_batch_train, y_mask_batch_train)
            self.ckpt.step.assign_add(1)

            t = f"[ EPOCH {epoch}/{epochs} ] - [ STEP {step}/{int(totalSteps)} ] Loss: {self.accTrainLoss.result():.3f}, cPSNR: {self.accTrainPSNR.result():.3f}"
            logger.info(t)
            self.saveLog('Train', globalStep)

            # Periodic validation pass every `evalStep` training steps.
            if step != 0 and (step % self.evalStep) == 0:
                # Reset states for test
                self.accTestLoss.reset_states()
                self.accTestPSNR.reset_states()
                for x_batch_val, y_batch_val, y_mask_batch_val in valSet:
                    self.testStep(x_batch_val, y_batch_val, y_mask_batch_val)
                self.saveLog('Test', globalStep)
                t = f"[ *************** VAL INFO *************** ] Validation Loss: {self.accTestLoss.result():.3f}, Validation PSNR: {self.accTestPSNR.result():.3f}"
                logger.info(t)
                # Keep only checkpoints that improve validation PSNR.
                if saveBestOnly and (self.accTestPSNR.result() <= self.ckpt.psnr):
                    continue
                logger.info('[ SAVE ] Saving checkpoint...')
                # NOTE(review): this rebinds ckpt.psnr to a tensor rather than
                # calling .assign() on the tf.Variable — confirm restore works.
                self.ckpt.psnr = self.accTestPSNR.result()
                self.ckptMngr.save()

    @tf.function
    def trainStep(self, patchLR, patchHR, maskHR):
        """One optimization step: forward, masked loss, backprop, metrics."""
        with tf.GradientTape() as tape:
            predPatchHR = self.ckpt.model(patchLR, training=True)
            # Loss(patchHR: tf.Tensor, maskHR: tf.Tensor, predPatchHR: tf.Tensor)
            loss = self.loss(patchHR, maskHR, predPatchHR)
        gradients = tape.gradient(loss, self.ckpt.model.trainable_variables)
        self.ckpt.optimizer.apply_gradients(
            zip(gradients, self.ckpt.model.trainable_variables))
        metric = self.metric(patchHR, maskHR, predPatchHR)
        self.accTrainLoss(loss)
        self.accTrainPSNR(metric)

    @tf.function
    def testStep(self, patchLR, patchHR, maskHR):
        """Evaluation step: no gradients, just loss/metric accumulation."""
        predPatchHR = self.ckpt.model(patchLR, training=False)
        loss = self.loss(patchHR, maskHR, predPatchHR)
        metric = self.metric(patchHR, maskHR, predPatchHR)
        self.accTestLoss(loss)
        self.accTestPSNR(metric)

    def saveLog(self, testOrTrain, globalStep):
        """Write current PSNR/Loss scalars to the train or test log writer."""
        w = self.trainWriter if testOrTrain == 'Train' else self.testWriter
        with w.as_default():
            if testOrTrain == 'Train':
                tf.summary.scalar('PSNR', self.accTrainPSNR.result(), step=globalStep)
                tf.summary.scalar('Loss', self.accTrainLoss.result(), step=globalStep)
            else:
                tf.summary.scalar('PSNR', self.accTestPSNR.result(), step=globalStep)
                tf.summary.scalar('Loss', self.accTestLoss.result(), step=globalStep)
            w.flush()
loss_value, grads = training.grad(x_train_, y_train_) optimizer.apply_gradients(zip(grads, teacher_model.trainable_weights)) loss_value_test = training.loss(x_val_, y_val_) probs = tf.nn.softmax(teacher_model(x_train_)) probs_val = tf.nn.softmax(teacher_model(x_val_)) loss_metric(loss_value) acc_metric(acc(y_train_, probs)) loss_metric_val(loss_value_test) acc_metric_val(acc_val(y_val_, probs_val)) # 学習進捗の表示 print( 'Epoch {}/{}: Loss: {:.3f}, Accuracy: {:.3%}, Validation Loss: {:.3f}, Validation Accuracy: {:.3%}' .format(epoch, EPOCHS_T, loss_metric.result().numpy(), acc_metric.result().numpy(), loss_metric_val.result().numpy(), acc_metric_val.result().numpy())) # LossとAccuracyの記録(後でグラフにプロットするため) history_teacher.losses.append(loss_metric.result().numpy()) history_teacher.accuracy.append(acc_metric.result().numpy() * 100) history_teacher.losses_val.append(loss_metric_val.result().numpy()) history_teacher.accuracy_val.append(acc_metric_val.result().numpy() * 100) # バッチサイズ変更 ds_train = tf.data.Dataset.from_tensor_slices( (x_train, y_train)).shuffle(x_train.shape[0]).batch(BATCH_SIZE_S) ds_val = tf.data.Dataset.from_tensor_slices( (x_val, y_val)).shuffle(x_val.shape[0]).batch(BATCH_SIZE_S) ds_test = tf.data.Dataset.from_tensor_slices(
class ModelTrainer:
    """
    Driver object that owns the checkpointed model, optimizer and metrics and
    runs the (optionally multi-GPU / distributed) training loop.

    Note: Having this model keeps the trainStep and testStep instance new
    every time you call it. Implementing those functions outside a class
    will return an error
    ValueError: Creating variables on a non-first call to a function
    decorated with tf.function.
    """

    def __init__(self, model, loss, metric, optimizer, ckptDir, logDir,
                 strategy, multiGPU=True, evalStep=10):
        # Safety checks: make sure checkpoint and log directories exist.
        if not os.path.exists(ckptDir):
            os.makedirs(ckptDir)
        if not os.path.exists(logDir):
            os.makedirs(logDir)
        # Checkpoint tracks the global step, best PSNR seen so far, the
        # optimizer state and the model weights, so training can resume.
        self.ckpt = tf.train.Checkpoint(step=tf.Variable(0),
                                        psnr=tf.Variable(1.0),
                                        optimizer=optimizer,
                                        model=model)
        self.ckptMngr = tf.train.CheckpointManager(checkpoint=self.ckpt,
                                                   directory=ckptDir,
                                                   max_to_keep=5)
        self.loss = loss      # callable: loss(patchHR, maskHR, predPatchHR)
        self.metric = metric  # callable: metric(patchHR, maskHR, predPatchHR) -> PSNR-like scalar
        self.logDir = logDir
        # Running means accumulated across batches; reset at epoch/eval boundaries.
        self.trainLoss = Mean(name='trainLoss')
        self.trainPSNR = Mean(name='trainPSNR')
        self.testLoss = Mean(name='testLoss')
        self.testPSNR = Mean(name='testPSNR')
        self.evalStep = evalStep
        self.multiGPU = multiGPU
        self.strategy = strategy
        # Restore from the latest checkpoint, if any, before training starts.
        self.restore()

    @property
    def model(self):
        """The live (checkpoint-tracked) model instance."""
        return self.ckpt.model

    def restore(self):
        """Restore model/optimizer/step from the newest checkpoint, if present."""
        if self.ckptMngr.latest_checkpoint:
            self.ckpt.restore(self.ckptMngr.latest_checkpoint)
            print(
                f'[ INFO ] Model restored from checkpoint at step {self.ckpt.step.numpy()}.'
            )

    def fitTrainData(self,
                     X: tf.Tensor, y: tf.Tensor,
                     batchSize: int, epochs: int,
                     valData: List[np.ma.array],
                     bufferSize: int = 256, valSteps: int = 128,
                     saveBestOnly: bool = True, initEpoch: int = 0):
        """
        Main training loop.

        :param X: training inputs
        :param y: pair-like of training targets and masks (y[0], y[1])
        :param batchSize: per-replica batch size
        :param epochs: number of epochs used to size the train dataset
        :param valData: triple of validation arrays (inputs, targets, masks)
        :param bufferSize: shuffle buffer size for dataset loading
        :param valSteps: number of validation steps per dataset pass
        :param saveBestOnly: only checkpoint when validation PSNR improves
        :param initEpoch: epoch counter starting value (for resumed runs)
        """
        if self.multiGPU:
            logger.info('[ INFO ] Multi-GPU mode selected...')
            logger.info('[ INFO ] Instantiate strategy...')
            batchSizePerReplica = batchSize
            # Global batch = per-replica batch times the number of replicas.
            globalBatchSize = batchSizePerReplica * self.strategy.num_replicas_in_sync
        else:
            globalBatchSize = batchSize
        logger.info('[ INFO ] Loading data set to buffer cache...')
        trainSet = loadTrainDataAsTFDataSet(X, y[0], y[1], epochs,
                                            globalBatchSize, bufferSize)
        valSet = loadValDataAsTFDataSet(valData[0], valData[1], valData[2],
                                        valSteps, globalBatchSize, bufferSize)
        logger.info('[ INFO ] Loading success...')
        if self.multiGPU:
            logger.info('[ INFO ] Distributing train set...')
            trainSet = self.strategy.experimental_distribute_dataset(trainSet)
            logger.info('[ INFO ] Distributing test set...')
            valSet = self.strategy.experimental_distribute_dataset(valSet)
        w = tf.summary.create_file_writer(self.logDir)
        dataSetLength = len(X)
        totalSteps = tf.cast(dataSetLength / globalBatchSize, tf.int64)
        # globalStep continues from the checkpointed step (resume support);
        # step is the position within the current epoch.
        globalStep = tf.cast(self.ckpt.step, tf.int64)
        step = globalStep % totalSteps
        epoch = initEpoch
        logger.info('[ INFO ] Begin training...')
        with w.as_default():
            for x_batch_train, y_batch_train, y_mask_batch_train in trainSet:
                if (totalSteps - step) == 0:
                    # Epoch boundary: bump epoch, rewind step and clear metrics.
                    epoch += 1
                    step = tf.cast(self.ckpt.step, tf.int64) % totalSteps
                    logger.info(f'[ NEW EPOCH ] Epoch number {epoch}')
                    # Reset metrics
                    self.trainLoss.reset_states()
                    self.trainPSNR.reset_states()
                    self.testLoss.reset_states()
                    self.testPSNR.reset_states()
                step += 1
                globalStep += 1
                self.trainDistStep(x_batch_train, y_batch_train,
                                   y_mask_batch_train)
                self.ckpt.step.assign_add(1)
                t = f"[ EPOCH {epoch}/{epochs} ] Step {step}/{int(totalSteps)}, Loss: {self.trainLoss.result():.3f}, cPSNR: {self.trainPSNR.result():.3f}"
                logger.info(t)
                tf.summary.scalar('Train PSNR', self.trainPSNR.result(),
                                  step=globalStep)
                tf.summary.scalar('Train loss', self.trainLoss.result(),
                                  step=globalStep)
                # Periodic validation every evalStep training steps.
                if step != 0 and (step % self.evalStep) == 0:
                    # Reset states for test
                    self.testLoss.reset_states()
                    self.testPSNR.reset_states()
                    for x_batch_val, y_batch_val, y_mask_batch_val in valSet:
                        self.testDistStep(x_batch_val, y_batch_val,
                                          y_mask_batch_val)
                    tf.summary.scalar('Test loss', self.testLoss.result(),
                                      step=globalStep)
                    tf.summary.scalar('Test PSNR', self.testPSNR.result(),
                                      step=globalStep)
                    t = f"[ VAL INFO ] Validation Loss: {self.testLoss.result():.3f}, Validation PSNR: {self.testPSNR.result():.3f}"
                    logger.info(t)
                    w.flush()
                    # Skip checkpointing unless validation PSNR improved.
                    if saveBestOnly and (self.testPSNR.result() <= self.ckpt.psnr):
                        continue
                    logger.info('[ SAVE ] Saving checkpoint...')
                    # NOTE(review): this rebinds ckpt.psnr to a plain tensor
                    # instead of calling .assign() on the tracked tf.Variable —
                    # confirm the best-PSNR value survives checkpoint restore.
                    self.ckpt.psnr = self.testPSNR.result()
                    self.ckptMngr.save()

    def computeLoss(self, patchHR, maskHR, predPatchHR):
        """Per-replica loss: data term scaled by batch size plus model
        regularization losses scaled by the replica count."""
        # NOTE(review): self.batchSize is never assigned in __init__ —
        # this raises AttributeError if computeLoss is called; confirm
        # whether a subclass/caller sets it or whether globalBatchSize
        # was intended here.
        loss = tf.reduce_sum(self.loss(patchHR, maskHR,
                                       predPatchHR)) * (1.0 / self.batchSize)
        loss += (sum(self.ckpt.model.losses) * 1.0 /
                 self.strategy.num_replicas_in_sync)
        return loss

    def calcMetric(self, patchHR, maskHR, predPatchHR):
        """Evaluate the quality metric (PSNR-like) for one batch."""
        return self.metric(patchHR, maskHR, predPatchHR)

    @tf.function
    def trainStep(self, patchLR, patchHR, maskHR):
        """Single forward/backward pass on one replica; returns the loss."""
        with tf.GradientTape() as tape:
            predPatchHR = self.ckpt.model(patchLR, training=True)
            # Loss(patchHR: tf.Tensor, maskHR: tf.Tensor, predPatchHR: tf.Tensor)
            loss = self.loss(patchHR, maskHR, predPatchHR)
        gradients = tape.gradient(loss, self.ckpt.model.trainable_variables)
        self.ckpt.optimizer.apply_gradients(
            zip(gradients, self.ckpt.model.trainable_variables))
        return loss

    @tf.function
    def testStep(self, patchLR, patchHR, maskHR):
        """Forward pass only (training=False); returns the loss."""
        predPatchHR = self.ckpt.model(patchLR, training=False)
        loss = self.loss(patchHR, maskHR, predPatchHR)
        return loss

    @tf.function
    def trainDistStep(self, patchLR, patchHR, maskHR):
        """Run trainStep on every replica, reduce to means and update the
        train metrics."""
        perExampleLosses = self.strategy.experimental_run_v2(
            self.trainStep, args=(patchLR, patchHR, maskHR))
        perExampleMetric = self.strategy.experimental_run_v2(
            self.calcMetric, args=(patchLR, patchHR, maskHR))
        meanLoss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        perExampleLosses, axis=0)
        meanMetric = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                          perExampleMetric, axis=0)
        self.trainLoss(meanLoss)
        self.trainPSNR(meanMetric)

    @tf.function
    def testDistStep(self, patchLR, patchHR, maskHR):
        """Run testStep on every replica, reduce to means and update the
        test metrics."""
        perExampleLosses = self.strategy.experimental_run_v2(
            self.testStep, args=(patchLR, patchHR, maskHR))
        perExampleMetric = self.strategy.experimental_run_v2(
            self.calcMetric, args=(patchLR, patchHR, maskHR))
        meanLoss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        perExampleLosses, axis=0)
        meanMetric = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                          perExampleMetric, axis=0)
        self.testLoss(meanLoss)
        self.testPSNR(meanMetric)
# Extra line of printout because the ProgBar would overwrite the logs in the terminal print("Epoch: {}/{}".format(epoch + 1, EPOCHS)) print("Epoch: {}/{}".format(epoch + 1, EPOCHS)) start = time.time() # Iterate over the batches of the dataset. for step, x_batch in enumerate(ds_train): with tf.GradientTape() as tape: loss = vae(image=x_batch, return_recon_loss=True) grads = tape.gradient(loss, vae.trainable_weights) optimizer.apply_gradients([ (grad, var) for (grad, var) in zip(grads, vae.trainable_variables) if grad is not None ]) loss_metric(loss) progress_bar.update(step) end = time.time() time_per_step = (end - start) * 1000 / steps print(" - {:.3f}ms/step - loss: {:.6f}".format(time_per_step, loss_metric.result())) if (epoch + 1) % 5 == 0 and epoch != 0: vae.save_weights("./dalle_tensorflow/model_weights/vae/vae_weights" + "_" + str(epoch + 1)) # Save the model weights (subclassed model cannot use save_model) vae.save_weights("./dalle_tensorflow/model_weights/vae/vae_weights")
def train_gan(self, train_dataset, valid_dataset, epochs=200000,
              valid_lr=None, valid_hr=None):
    """
    Adversarial training loop for the SR GAN.

    :param train_dataset: dataset yielding (lr, hr) training pairs
    :param valid_dataset: dataset used for PSNR evaluation
    :param epochs: number of (lr, hr) batches to consume; here one batch == one "epoch"
    :param valid_lr: low-res sample batch used for periodic image dumps
    :param valid_hr: matching high-res batch used for periodic image dumps
    """
    # Checkpoint every 10% of the run.
    # NOTE(review): this is a float (true division); the `epoch % evaluate_size`
    # test below therefore does float modulo — works for the default
    # epochs=200000, but confirm epochs is always divisible by 10.
    evaluate_size = epochs / 10
    start = time.time()
    # Running means, reset every 50 steps when they are reported.
    vgg_metric = Mean()
    dls_metric = Mean()
    g_metric = Mean()
    c_metric = Mean()
    epoch = 0
    for lr, hr in train_dataset.take(epochs):
        epoch += 1
        step = tf.convert_to_tensor(epoch, tf.int64)
        vgg_loss, discremenator_loss, generator_loss, content_loss = self.train_gan_step(
            lr, hr)
        vgg_metric(vgg_loss)
        dls_metric(discremenator_loss)
        g_metric(generator_loss)
        c_metric(content_loss)
        # Report (and reset) the averaged losses every 50 steps.
        if epoch % 50 == 0:
            vgg = vgg_metric.result()
            discriminator_loss_metric = dls_metric.result()
            generator_loss_metric = g_metric.result()
            content_loss_metric = c_metric.result()
            vgg_metric.reset_states()
            dls_metric.reset_states()
            g_metric.reset_states()
            c_metric.reset_states()
            # PSNR on a single validation batch.
            psnr_value = self.evaluate(valid_dataset.take(1))
            print(
                f'Time for epoch {epoch}/{epochs} is {(time.time() - start):.4f} sec, '
                f' perceptual loss = {vgg:.4f},'
                f' generator loss = {generator_loss_metric:.4f},'
                f' discriminator loss = {discriminator_loss_metric:.4f},'
                f' content loss = {content_loss_metric:.4f},'
                f' psnr = {psnr_value:.4f}')
            start = time.time()
            if self.summary_writer is not None:
                with self.summary_writer.as_default():
                    tf.summary.scalar('generator_loss',
                                      generator_loss_metric, step=epoch)
                    tf.summary.scalar('content loss',
                                      content_loss_metric, step=epoch)
                    tf.summary.scalar(
                        'vgg loss = content loss + 0.0001 * gan loss',
                        vgg, step=epoch)
                    tf.summary.scalar('discremenator_loss',
                                      discriminator_loss_metric, step=epoch)
                    tf.summary.scalar('psnr', psnr_value, step=epoch)
        # Persist a checkpoint every 10% of the run.
        if epoch % evaluate_size == 0:
            self.util.save_checkpoint(self.checkpoint, epoch)
        # Dump sample SR images every 5000 steps.
        if epoch % 5000 == 0:
            self.generate_and_save_images(step, valid_lr, valid_hr)
@tf.function
def testing(images, labels):
    """Run one evaluation step and fold the results into the test metrics."""
    outputs = model(images)
    batch_loss = loss_(labels, outputs)
    test_loss(batch_loss)
    test_accuracy(labels, outputs)


# TRAINING
to_print = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
for epoch in range(EPOCHS):
    # One full pass over the training data, then over the test data.
    for batch_images, batch_labels in train:
        training(batch_images, batch_labels)
    for batch_images, batch_labels in test:
        testing(batch_images, batch_labels)

    print(to_print.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))

    # Clear every accumulated metric before the next epoch starts.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        metric.reset_states()

model.save_weights('model', save_format='tf')
class MNIST2MNIST_M_DANN(object):
    def __init__(self, config):
        """
        Initialize the MNIST -> MNIST-M domain-adaptation (DANN) network.

        :param config: configuration object holding hyper-parameters, paths
                       and an optional pre-trained weight path
        """
        self.cfg = config

        # Weight of the gradient-reversal layer (updated on a schedule
        # during training).
        self.grl_lambd = 1.0

        # Build the domain-adaptation network.
        self.build_DANN()

        # Training/validation losses and metrics.
        self.loss = categorical_crossentropy
        self.acc = categorical_accuracy
        self.train_loss = Mean("train_loss", dtype=tf.float32)
        self.train_image_cls_loss = Mean("train_image_cls_loss", dtype=tf.float32)
        self.train_domain_cls_loss = Mean("train_domain_cls_loss", dtype=tf.float32)
        self.train_image_cls_acc = Mean("train_image_cls_acc", dtype=tf.float32)
        self.train_domain_cls_acc = Mean("train_domain_cls_acc", dtype=tf.float32)
        self.val_loss = Mean("val_loss", dtype=tf.float32)
        self.val_image_cls_loss = Mean("val_image_cls_loss", dtype=tf.float32)
        self.val_domain_cls_loss = Mean("val_domain_cls_loss", dtype=tf.float32)
        self.val_image_cls_acc = Mean("val_image_cls_acc", dtype=tf.float32)
        self.val_domain_cls_acc = Mean("val_domain_cls_acc", dtype=tf.float32)

        # Optimizer; its learning rate is overwritten every step by the
        # schedule in train_one_epoch.
        self.optimizer = tf.keras.optimizers.SGD(self.cfg.init_learning_rate,
                                                 momentum=self.cfg.momentum_rate)

    def build_DANN(self):
        """
        Assemble the DANN: a shared feature extractor feeding an image
        classifier directly and a domain classifier through a
        gradient-reversal layer (GRL).
        """
        # Shared image input for both classification heads.
        self.image_input = Input(shape=self.cfg.image_input_shape, name="image_input")
        # Features shared by the domain classifier and the image classifier.
        self.feature_encoder = build_feature_extractor()
        # Classification heads.
        self.image_cls_encoder = build_image_classify_extractor()
        self.domain_cls_encoder = build_domain_classify_extractor()
        self.grl = GradientReversalLayer()
        self.dann_model = Model(self.image_input,
                                [self.image_cls_encoder(self.feature_encoder(self.image_input)),
                                 self.domain_cls_encoder(self.grl(self.feature_encoder(self.image_input)))])
        self.dann_model.summary()
        # Optionally warm-start from pre-trained weights.
        if self.cfg.pre_model_path is not None:
            self.dann_model.load_weights(self.cfg.pre_model_path,
                                         by_name=True, skip_mismatch=True)

    def train(self, train_source_datagen, train_target_datagen,
              val_target_datagen, train_iter_num, val_iter_num):
        """
        Full DANN training loop.

        :param train_source_datagen: source-domain training batch generator
        :param train_target_datagen: target-domain training batch generator
        :param val_target_datagen: target-domain validation batch generator
        :param train_iter_num: training iterations per epoch
        :param val_iter_num: validation iterations per evaluation pass
        """
        # Per-run directories for checkpoints and logs.
        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        checkpoint_dir = os.path.join(self.cfg.checkpoints_dir, timestamp)
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        log_dir = os.path.join(self.cfg.logs_dir, timestamp)
        if not os.path.exists(log_dir):
            os.mkdir(log_dir)
        self.cfg.save_config(timestamp)
        self.writer_hyperparameter = tf.summary.create_file_writer(os.path.join(log_dir, "hyperparameter"))
        self.writer_train = tf.summary.create_file_writer(os.path.join(log_dir, "train"))
        self.writer_val = tf.summary.create_file_writer(os.path.join(log_dir, 'validation'))

        print('\n----------- start to train -----------\n')
        with open(os.path.join(log_dir, 'log.txt'), 'w') as f:
            for ep in np.arange(1, self.cfg.epoch + 1, 1):
                # Progress bar for this epoch.
                self.progbar = Progbar(train_iter_num + 1)
                print('Epoch {}/{}'.format(ep, self.cfg.epoch))

                # One epoch of training followed by one evaluation pass.
                train_loss, train_image_cls_acc = self.train_one_epoch(
                    train_source_datagen, train_target_datagen, train_iter_num, ep)
                val_loss, val_image_cls_acc = self.eval_one_epoch(
                    val_target_datagen, val_iter_num, ep)
                self.progbar.update(train_iter_num + 1,
                                    [('val_loss', val_loss),
                                     ("val_image_acc", val_image_cls_acc)])

                # Reset every tracker exactly once per epoch.
                # (Fixed: the original reset *_image_cls_acc twice and never
                # reset *_image_cls_loss, so those losses accumulated across
                # all epochs.)
                self.train_loss.reset_states()
                self.train_image_cls_loss.reset_states()
                self.train_domain_cls_loss.reset_states()
                self.train_image_cls_acc.reset_states()
                self.train_domain_cls_acc.reset_states()
                self.val_loss.reset_states()
                self.val_image_cls_loss.reset_states()
                self.val_domain_cls_loss.reset_states()
                self.val_image_cls_acc.reset_states()
                self.val_domain_cls_acc.reset_states()

                # Log and checkpoint this epoch.  The format string (including
                # the historical "imgae" typo) is kept as-is so checkpoint
                # file names stay compatible with existing tooling.
                log_line = "Epoch{:03d}-train_loss-{:.3f}-val_loss-{:.3f}-train_imgae_cls_acc-{:.3f}-val_image_cls_acc-{:.3f}"\
                    .format(ep, train_loss, val_loss, train_image_cls_acc, val_image_cls_acc)
                print(log_line)
                f.write(log_line + "\n")
                self.dann_model.save(os.path.join(checkpoint_dir, log_line + ".h5"))

        self.dann_model.save(os.path.join(checkpoint_dir, "trained_dann_mnist2mnist_m.h5"))
        print('\n----------- end to train -----------\n')

    def train_one_epoch(self, train_source_datagen, train_target_datagen,
                        train_iter_num, ep):
        """
        Run one epoch of DANN training.

        :param train_source_datagen: source-domain training batch generator
        :param train_target_datagen: target-domain training batch generator
        :param train_iter_num: iterations in one training epoch
        :param ep: current epoch number (1-based)
        :return: (mean training loss, mean image-classification accuracy)
        """
        for i in np.arange(1, train_iter_num + 1):
            # Fetch one mini-batch from each domain, build domain labels
            # ([1,0] = source, [0,1] = target) and the combined image batch.
            batch_mnist_image_data, batch_mnist_labels = train_source_datagen.__next__()
            batch_mnist_m_image_data, batch_mnist_m_labels = train_target_datagen.__next__()
            batch_domain_labels = np.vstack(
                [np.tile([1., 0.], [len(batch_mnist_labels), 1]),
                 np.tile([0., 1.], [len(batch_mnist_m_labels), 1])]).astype(np.float32)
            batch_image_data = np.concatenate(
                [batch_mnist_image_data, batch_mnist_m_image_data], axis=0)

            # Schedule the GRL weight and learning rate on overall progress,
            # then log both.
            global_step = (ep - 1) * train_iter_num + i
            process = global_step * 1.0 / (self.cfg.epoch * train_iter_num)
            self.grl_lambd = grl_lambda_schedule(process)
            learning_rate = learning_rate_schedule(
                process, init_learning_rate=self.cfg.init_learning_rate)
            tf.keras.backend.set_value(self.optimizer.lr, learning_rate)
            with self.writer_hyperparameter.as_default():
                tf.summary.scalar("hyperparameter/learning_rate",
                                  tf.convert_to_tensor(learning_rate), global_step)
                tf.summary.scalar("hyperparameter/grl_lambda",
                                  tf.convert_to_tensor(self.grl_lambd), global_step)

            with tf.GradientTape() as tape:
                # Image classification on source data only.
                image_cls_feature = self.feature_encoder(batch_mnist_image_data)
                image_cls_pred = self.image_cls_encoder(image_cls_feature, training=True)
                image_cls_loss = self.loss(batch_mnist_labels, image_cls_pred)
                image_cls_acc = self.acc(batch_mnist_labels, image_cls_pred)
                # Domain classification on the combined batch through the GRL.
                domain_cls_feature = self.feature_encoder(batch_image_data)
                domain_cls_pred = self.domain_cls_encoder(
                    self.grl(domain_cls_feature, self.grl_lambd), training=True)
                domain_cls_loss = self.loss(batch_domain_labels, domain_cls_pred)
                domain_cls_acc = self.acc(batch_domain_labels, domain_cls_pred)
                # Total loss = mean image loss + mean domain loss.
                loss = tf.reduce_mean(image_cls_loss) + tf.reduce_mean(domain_cls_loss)

            # Custom optimization step over every variable the tape watched.
            # (Renamed from `vars`, which shadowed the builtin.)
            watched_vars = tape.watched_variables()
            grads = tape.gradient(loss, watched_vars)
            self.optimizer.apply_gradients(zip(grads, watched_vars))

            # Accumulate running means and update the progress bar.
            self.train_loss(loss)
            self.train_image_cls_loss(image_cls_loss)
            self.train_domain_cls_loss(domain_cls_loss)
            self.train_image_cls_acc(image_cls_acc)
            self.train_domain_cls_acc(domain_cls_acc)
            self.progbar.update(i, [('loss', loss),
                                    ('image_cls_loss', image_cls_loss),
                                    ('domain_cls_loss', domain_cls_loss),
                                    ("image_acc", image_cls_acc),
                                    ("domain_acc", domain_cls_acc)])

        # Log the epoch-level losses and metrics.
        with self.writer_train.as_default():
            tf.summary.scalar("loss/loss", self.train_loss.result(), ep)
            tf.summary.scalar("loss/image_cls_loss", self.train_image_cls_loss.result(), ep)
            tf.summary.scalar("loss/domain_cls_loss", self.train_domain_cls_loss.result(), ep)
            tf.summary.scalar("acc/image_cls_acc", self.train_image_cls_acc.result(), ep)
            tf.summary.scalar("acc/domain_cls_acc", self.train_domain_cls_acc.result(), ep)

        return self.train_loss.result(), self.train_image_cls_acc.result()

    def eval_one_epoch(self, val_target_datagen, val_iter_num, ep):
        """
        Run one evaluation pass on the target domain.

        :param val_target_datagen: target-domain validation batch generator
        :param val_iter_num: iterations in one validation pass
        :param ep: current epoch number (1-based)
        :return: (mean validation loss, mean image-classification accuracy)
        """
        for i in np.arange(1, val_iter_num + 1):
            # Fetch one target-domain batch; all samples carry the target
            # domain label [0, 1].
            batch_mnist_m_image_data, batch_mnist_m_labels = val_target_datagen.__next__()
            batch_mnist_m_domain_labels = np.tile(
                [0., 1.], [len(batch_mnist_m_labels), 1]).astype(np.float32)

            # Forward pass through both heads (no GRL at evaluation time).
            target_image_feature = self.feature_encoder(batch_mnist_m_image_data)
            target_image_cls_pred = self.image_cls_encoder(target_image_feature, training=False)
            target_domain_cls_pred = self.domain_cls_encoder(target_image_feature, training=False)

            # Losses and metrics on the target domain.
            target_image_cls_loss = self.loss(batch_mnist_m_labels, target_image_cls_pred)
            target_domain_cls_loss = self.loss(batch_mnist_m_domain_labels, target_domain_cls_pred)
            target_loss = tf.reduce_mean(target_image_cls_loss) + tf.reduce_mean(target_domain_cls_loss)
            image_cls_acc = self.acc(batch_mnist_m_labels, target_image_cls_pred)
            domain_cls_acc = self.acc(batch_mnist_m_domain_labels, target_domain_cls_pred)

            # Accumulate running means.
            # (Fixed: the domain-loss tracker was fed domain_cls_acc — an
            # accuracy — instead of the actual domain classification loss.)
            self.val_loss(target_loss)
            self.val_image_cls_loss(target_image_cls_loss)
            self.val_domain_cls_loss(target_domain_cls_loss)
            self.val_image_cls_acc(image_cls_acc)
            self.val_domain_cls_acc(domain_cls_acc)

        # Log the validation losses and metrics.
        with self.writer_val.as_default():
            tf.summary.scalar("loss/loss", self.val_loss.result(), ep)
            tf.summary.scalar("loss/image_cls_loss", self.val_image_cls_loss.result(), ep)
            tf.summary.scalar("loss/domain_cls_loss", self.val_domain_cls_loss.result(), ep)
            tf.summary.scalar("acc/image_cls_acc", self.val_image_cls_acc.result(), ep)
            tf.summary.scalar("acc/domain_cls_acc", self.val_domain_cls_acc.result(), ep)

        return self.val_loss.result(), self.val_image_cls_acc.result()
# Resume from the latest checkpoint if one exists, otherwise start fresh.
ckpt.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print("Restored from {}".format(manager.latest_checkpoint))
else:
    print("Initializing from scratch.")

for epoch in range(config['epochs']):
    # One pass over the training set, then one over the validation set.
    for X, y in tqdm(train_ds):
        train_step(X, y)
    for X, y in tqdm(val_ds):
        val_step(X, y)

    # Per-epoch summary (Korean: epoch, loss, accuracy, test loss, test accuracy).
    # NOTE(review): train_loss/train_accuracy/val_loss/val_accuracy are never
    # reset here, so if they are stateful Keras metrics these numbers are
    # running means over ALL epochs so far — confirm whether per-epoch values
    # were intended.
    template = '학습 에포크: {}, 손실: {}, 정확도: {}, 테스트 손실: {}, 테스트 정확도: {}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          val_loss.result(),
                          val_accuracy.result() * 100))

    # save checkpoint
    save_path = manager.save()
    print(f'Saved checkpoint for epoch {int(ckpt.step)}: {save_path}\n')
    ckpt.step.assign_add(1)

print('\n학습이 완료됐습니다!!!!\n')
print('<Model 정보 요약>')
model.summary()
def low_level_train(optimizer, yolo_loss, train_datasets, valid_datasets,
                    train_steps, valid_steps):
    """
    Train with a low-level custom loop, which gives better visibility into
    the training process and the evolution of its variables.

    :param optimizer: optimizer instance
    :param yolo_loss: list of custom loss functions, one per feature level
    :param train_datasets: training data wrapped in tf.data
    :param valid_datasets: validation data
    :param train_steps: iterations per training epoch
    :param valid_steps: iterations per validation pass
    :return: None
    """
    # Build the model.
    model = yolo_body()

    # Metrics used to evaluate the model.
    train_loss = Mean(name='train_loss')
    valid_loss = Mean(name='valid_loss')

    # State for keeping only the best model + early stopping.
    best_test_loss = float('inf')
    patience = 10
    min_delta = 1e-3
    patience_cnt = 0
    history_loss = []

    # Create the summary writer for TensorBoard.
    summary_writer = tf.summary.create_file_writer(logdir=cfg.log_dir)

    # Low-level loss computation loop.
    for epoch in range(1, cfg.epochs + 1):
        train_loss.reset_states()
        valid_loss.reset_states()
        step = 0
        print("Epoch {}/{}".format(epoch, cfg.epochs))

        # Training pass.
        for batch, (images, labels) in enumerate(train_datasets.take(train_steps)):
            with tf.GradientTape() as tape:
                # Forward pass.
                outputs = model(images, training=True)
                # Regularization loss (collected via model.losses, which
                # requires kernel_regularizer to be set on the Conv2D layers).
                regularization_loss = tf.reduce_sum(model.losses)
                pred_loss = []
                # yolo_loss, labels and outputs each hold data for the three
                # feature levels; unpacking pairs one loss_fn from yolo_loss
                # with one feature level, computed one at a time.
                for output, label, loss_fn in zip(outputs, labels, yolo_loss):
                    pred_loss.append(loss_fn(label, output))
                # Total loss = YOLO loss + regularization loss.
                total_train_loss = tf.reduce_sum(pred_loss) + regularization_loss

            # Backpropagation / gradient descent.
            # model.trainable_variables means the loss is propagated back
            # into every trainable variable.
            grads = tape.gradient(total_train_loss, model.trainable_variables)
            # Apply each variable's gradient to update it;
            # zip pairs gradients with their trainable variables.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Update train_loss.
            train_loss.update_state(total_train_loss)

            # Render a textual progress bar for the training pass.
            rate = (step + 1) / train_steps
            a = "*" * int(rate * 70)
            b = "." * int((1 - rate) * 70)
            loss = train_loss.result().numpy()
            print("\r{}/{} {:^3.0f}%[{}->{}] - loss:{:.4f}".
                  format(batch, train_steps, int(rate * 100), a, b, loss), end='')
            step += 1

        # Validation pass.
        for batch, (images, labels) in enumerate(valid_datasets.take(valid_steps)):
            # Forward pass without training.
            outputs = model(images)
            regularization_loss = tf.reduce_sum(model.losses)
            pred_loss = []
            for output, label, loss_fn in zip(outputs, labels, yolo_loss):
                pred_loss.append(loss_fn(label, output))
            total_valid_loss = tf.reduce_sum(pred_loss) + regularization_loss

            # Update valid_loss.
            valid_loss.update_state(total_valid_loss)

        print('\nLoss: {:.4f}, Test Loss: {:.4f}\n'.format(train_loss.result(),
                                                           valid_loss.result()))
        # Record the loss history (validation loss; train loss would also work).
        history_loss.append(valid_loss.result().numpy())

        # Write both losses to TensorBoard.
        with summary_writer.as_default():
            tf.summary.scalar('train_loss', train_loss.result(),
                              step=optimizer.iterations)
            tf.summary.scalar('valid_loss', valid_loss.result(),
                              step=optimizer.iterations)

        # Keep only the best model (lowest validation loss).
        if valid_loss.result() < best_test_loss:
            best_test_loss = valid_loss.result()
            model.save_weights(cfg.model_path, save_format='tf')

        # EarlyStopping: reset patience when validation loss improved by
        # more than min_delta, otherwise count a stall.
        if epoch > 1 and history_loss[epoch - 2] - history_loss[epoch - 1] > min_delta:
            patience_cnt = 0
        else:
            patience_cnt += 1
        if patience_cnt >= patience:
            tf.print("No improvement for {} times, early stopping optimization.".format(patience))
            break
class Pix2Pose(Model):
    """
    Keras model wrapping Pix2Pose adversarial training: a generator that
    predicts RGB(+error) images and a discriminator, trained in four phases
    per step (reconstruction, error prediction, discriminator, generator).
    """

    def __init__(self, image_shape, discriminator, generator, latent_dim):
        super(Pix2Pose, self).__init__()
        self.image_shape = image_shape
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        # Loss trackers are created here rather than in compile() so the
        # `metrics` property is usable as soon as the object exists.
        # (Originally they were created in compile(), so touching `metrics`
        # before compile() raised AttributeError.)
        self.generator_loss = Mean(name='generator_loss')
        self.discriminator_loss = Mean(name='discriminator_loss')
        self.reconstruction_loss = Mean(name='weighted_reconstruction')
        self.error_prediction_loss = Mean(name='error_prediction')

    @property
    def metrics(self):
        # All four trackers are listed so Keras resets each of them at epoch
        # boundaries. (Originally only the generator/discriminator trackers
        # were listed, so the reconstruction and error-prediction means
        # accumulated across epochs.)
        return [self.generator_loss, self.discriminator_loss,
                self.reconstruction_loss, self.error_prediction_loss]

    def compile(self, optimizers, losses, loss_weights):
        """Store per-submodel optimizers, loss callables and loss weights.

        :param optimizers: dict with 'generator' and 'discriminator' optimizers
        :param losses: dict with 'weighted_reconstruction', 'error_prediction'
                       and 'discriminator' loss callables
        :param loss_weights: dict with 'weighted_reconstruction' and
                             'error_prediction' scalar weights
        """
        super(Pix2Pose, self).compile()
        self.optimizer_generator = optimizers['generator']
        self.optimizer_discriminator = optimizers['discriminator']
        self.compute_reconstruction_loss = losses['weighted_reconstruction']
        self.compute_error_prediction_loss = losses['error_prediction']
        self.compute_discriminator_loss = losses['discriminator']
        self.reconstruction_weight = loss_weights['weighted_reconstruction']
        self.error_prediction_weight = loss_weights['error_prediction']

    def _build_discriminator_labels(self, batch_size):
        # First half 1s (fake slot), second half 0s (real slot), matching the
        # [fake, real] concatenation order built in _train_discriminator.
        # NOTE(review): `tf.ones(batch_size, 1)` passes 1 as the *dtype*
        # argument (DataType enum 1 == float32), yielding a rank-1 tensor —
        # if a (batch_size, 1) label column was intended, this should be
        # `tf.ones((batch_size, 1))`; confirm against the discriminator's
        # output shape before changing.
        return tf.concat([tf.ones(batch_size, 1), tf.zeros(batch_size, 1)], 0)

    def _add_noise_to_labels(self, labels):
        # Label noise regularizes the discriminator.
        noise = tf.random.uniform(tf.shape(labels))
        labels = labels + 0.05 * noise
        return labels

    def _get_batch_size(self, values):
        return tf.shape(values)[0]

    def _train_discriminator(self, RGB_inputs, RGBA_true):
        """One discriminator update on a [fake, real] RGB batch; returns its loss."""
        RGB_true = RGBA_true[:, :, :, 0:3]
        RGB_fake = self.generator(RGB_inputs)[:, :, :, 0:3]
        RGB_fake_true = tf.concat([RGB_fake, RGB_true], axis=0)
        batch_size = self._get_batch_size(RGB_inputs)
        y_true = self._build_discriminator_labels(batch_size)
        y_true = self._add_noise_to_labels(y_true)
        with tf.GradientTape() as tape:
            y_pred = self.discriminator(RGB_fake_true)
            discriminator_loss = self.compute_discriminator_loss(
                y_true, y_pred)
        gradients = tape.gradient(discriminator_loss,
                                  self.discriminator.trainable_weights)
        self.optimizer_discriminator.apply_gradients(
            zip(gradients, self.discriminator.trainable_weights))
        return discriminator_loss

    def _train_generator(self, RGB_inputs):
        """One adversarial generator update (labels flipped to fool the
        discriminator); returns the generator's adversarial loss."""
        batch_size = tf.shape(RGB_inputs)[0]
        y_misleading = tf.zeros((batch_size, 1))
        with tf.GradientTape() as tape:
            RGBE_preds = self.generator(RGB_inputs)
            y_pred = self.discriminator(RGBE_preds[..., 0:3])
            generator_loss = self.compute_discriminator_loss(
                y_misleading, y_pred)
        gradients = tape.gradient(generator_loss,
                                  self.generator.trainable_weights)
        self.optimizer_generator.apply_gradients(
            zip(gradients, self.generator.trainable_weights))
        return generator_loss

    def _train_reconstruction(self, RGB_inputs, RGBA_true):
        """One generator update on the weighted reconstruction loss."""
        with tf.GradientTape() as tape:
            RGBE_pred = self.generator(RGB_inputs)
            reconstruction_loss = self.compute_reconstruction_loss(
                RGBA_true, RGBE_pred)
            reconstruction_loss = (self.reconstruction_weight *
                                   reconstruction_loss)
        gradients = tape.gradient(reconstruction_loss,
                                  self.generator.trainable_weights)
        self.optimizer_generator.apply_gradients(
            zip(gradients, self.generator.trainable_weights))
        return reconstruction_loss

    def _train_error_prediction(self, RGB_inputs, RGBA_true):
        """One generator update on the weighted error-prediction loss."""
        with tf.GradientTape() as tape:
            RGBE_pred = self.generator(RGB_inputs)
            error_prediction_loss = self.compute_error_prediction_loss(
                RGBA_true, RGBE_pred)
            error_prediction_loss = (self.error_prediction_weight *
                                     error_prediction_loss)
        gradients = tape.gradient(error_prediction_loss,
                                  self.generator.trainable_weights)
        self.optimizer_generator.apply_gradients(
            zip(gradients, self.generator.trainable_weights))
        return error_prediction_loss

    def train_step(self, data):
        """Run the four training phases on one batch and report all trackers."""
        RGB_inputs, RGBA_true = data[0]['RGB_input'], data[1]['RGB_with_error']
        reconstruction_loss = self._train_reconstruction(RGB_inputs, RGBA_true)
        self.reconstruction_loss.update_state(reconstruction_loss)
        error_loss = self._train_error_prediction(RGB_inputs, RGBA_true)
        self.error_prediction_loss.update_state(error_loss)
        discriminator_loss = self._train_discriminator(RGB_inputs, RGBA_true)
        self.discriminator_loss.update_state(discriminator_loss)
        generator_loss = self._train_generator(RGB_inputs)
        self.generator_loss.update_state(generator_loss)
        return {
            'discriminator_loss': self.discriminator_loss.result(),
            'generator_loss': self.generator_loss.result(),
            'reconstruction_loss': self.reconstruction_loss.result(),
            'error_prediction_loss': self.error_prediction_loss.result()
        }
optimizer.apply_gradients(zip(grads, teacher_model.trainable_variables)) loss_value_test = training.loss([x_val_main_, x_val_aux_], y_val_) epoch_loss_avg(loss_value) epoch_accuracy( y_train_, tf.nn.softmax(teacher_model([x_train_main_, x_train_aux_]))) epoch_loss_avg_val(loss_value_test) epoch_accuracy_val( y_val_, tf.nn.softmax(teacher_model([x_val_main_, x_val_aux_]))) # 学習進捗の表示 print( 'Epoch {}/{}: Loss: {:.3f}, Accuracy: {:.3%}, Validation Loss: {:.3f}, Validation Accuracy: {:.3%}' .format(epoch, EPOCHS_T, epoch_loss_avg.result(), epoch_accuracy.result(), epoch_loss_avg_val.result(), epoch_accuracy_val.result())) # Studentモデルの定義 student = KDModel.Students(NUM_CLASSES, T) student_model = student.createModel(inputs_main) # Studentモデルの学習 student_model.summary() # plot_model(student_soft_model, show_shapes=True, to_file='student_model.png') kd = KDModel.KnowledgeDistillation(teacher_model, student_model, T, ALPHA) history_student = LossAccHistory() for epoch in range(1, EPOCHS_S + 1): epoch_loss_avg = Mean() epoch_loss_avg_val = Mean()