def train(inputs, data):
    """Build, compile and fit the U-Net segmentation model.

    :param inputs: input tensor/shape handed to ``unet()`` to build the model
    :param data: 4-tuple ``(train_idx, mask_count_df, train_df, val_idx)``
                 as produced upstream — TODO confirm exact element semantics
    """
    model = unet(inputs)
    model.summary()
    train_idx, mask_count_df, train_df, val_idx = data
    config = Config()
    # Training generator: augmentation and shuffling enabled.
    train_generator = DataGenerator(train_idx, df=mask_count_df, target_df=train_df,
                                    batch_size=config.batch_size,
                                    reshape=(config.height, config.width),
                                    augment=True, graystyle=False, shuffle=True,
                                    n_channels=config.channels, n_classes=config.n_classes)
    # Validation generator: deterministic (no augmentation, no shuffling).
    val_generator = DataGenerator(val_idx, df=mask_count_df, target_df=train_df,
                                  batch_size=config.batch_size,
                                  reshape=(config.height, config.width),
                                  augment=False, graystyle=False, shuffle=False,
                                  n_channels=config.channels, n_classes=config.n_classes)
    # FIX: the original also built a 'train_eval_generator' that was never used
    # anywhere in this function; removed as dead code.
    # NOTE(review): callbacks monitor training 'loss', not 'val_loss', even though
    # a validation generator is passed to fit — confirm this is intentional.
    earlystopping = EarlyStopping(monitor='loss', patience=config.es_patience)
    reduce_lr = ReduceLROnPlateau(monitor='loss', patience=config.rlrop_patience,
                                  factor=config.decay_drop, min_lr=1e-6)
    # Save weights every epoch; filename embeds epoch number and training loss.
    checkpoint = ModelCheckpoint(filepath='weights-{epoch:03d}-{loss:.2f}.h5',
                                 monitor='loss', save_best_only=False,
                                 save_weights_only=True)
    metric_list = [dice_coef]
    callback_list = [earlystopping, reduce_lr, checkpoint]
    optimizer = Adam(lr=config.learning_rate)
    model.compile(optimizer=optimizer, loss=bce_dice_loss, metrics=metric_list)
    checkpoint.set_model(model)
    model.fit_generator(train_generator,
                        validation_data=val_generator,
                        callbacks=callback_list,
                        epochs=100,
                        initial_epoch=0)
def initLogging(params, modelD, modelG):
    """Create a timestamped output folder and wire up training callbacks.

    Sets up TensorBoard (attached to the generator), one ModelCheckpoint per
    model, dumps both model architectures as JSON plus the parameter dict, and
    tees stdout into a console log file.

    :param params: dict — must contain 'logdir' and 'batchsize' keys
    :param modelD: discriminator Keras model
    :param modelG: generator Keras model
    :return: list [tb, mcD, mcG, outfolder]
    """
    # Generate actual output folder with timestamp
    timestring = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # NOTE(review): assumes params['logdir'] starts with './' — split('./')[1]
    # strips that prefix and will IndexError otherwise; confirm against callers.
    outfolder = os.path.join(os.getcwd(), params['logdir'].split('./')[1], timestring)
    os.makedirs(outfolder, exist_ok = True)
    # Initialize TensorBoard (logs the generator's graph/gradients/images)
    tb = TensorBoard(log_dir = outfolder, batch_size = params['batchsize'], write_graph = True, write_grads = True, write_images = True)
    tb.set_model(modelG)
    # Initialize checkpointing — both checkpoints keep only the best 'epe'
    # (lower is better); the driver loop must feed 'epe' via on_epoch_end logs.
    #mcD = ModelCheckpoint(filepath=os.path.join(outfolder, 'discriminator.epoch{epoch:04d}-epe{epe:9.5f}.h5'),
    mcD = ModelCheckpoint(filepath=os.path.join(outfolder, 'discriminator.h5'), save_best_only = True, monitor='epe', mode='min')
    mcD.set_model(modelD)
    #mcG = ModelCheckpoint(filepath=os.path.join(outfolder, 'generator.epoch{epoch:04d}-epe{epe:9.5f}.h5'),
    mcG = ModelCheckpoint(filepath=os.path.join(outfolder, 'generator.h5'), save_best_only = True, monitor='epe', mode='min')
    mcG.set_model(modelG)
    # Save JSON representation of models
    with open(os.path.join(outfolder, "discriminator.json"), "w") as json_fileD:
        json_fileD.write(modelD.to_json())
    with open(os.path.join(outfolder, "generator.json"), "w") as json_fileG:
        json_fileG.write(modelG.to_json())
    # Save params to file (repr of the dict, not JSON)
    with open(os.path.join(outfolder, "params_train.log"), "w") as paramfile:
        paramfile.write(repr(params))
    # Clone stdout to file — global side effect: replaces sys.stdout process-wide
    sys.stdout = Logger(sys.stdout, os.path.join(outfolder, "console_train.log"))
    return [tb, mcD, mcG, outfolder]
def train_rtvsrgan(self, epochs=None, batch_size=None, modelname=None, datapath_train=None, datapath_validation=None, steps_per_validation=None, datapath_test=None, workers=None, max_queue_size=None, first_epoch=None, print_frequency=None, crops_per_image=None, log_weight_frequency=None, log_weight_path=None, log_tensorboard_path=None, log_tensorboard_update_freq=None, log_test_frequency=None, log_test_path=None, media_type='i'):
    """Train the ESRGAN network

    :param int epochs: how many epochs to train the network for
    :param str modelname: name to use for storing model weights etc.
    :param str datapath_train: path for the image files to use for training
    :param str datapath_test: path for the image files to use for testing / plotting
    :param int print_frequency: how often (in epochs) to print progress to terminal. Warning: will run validation inference!
    :param int log_weight_frequency: how often (in epochs) should network weights be saved. None for never
    :param int log_weight_path: where should network weights be saved
    :param int log_test_frequency: how often (in epochs) should testing & validation be performed
    :param str log_test_path: where should test results be saved
    :param str log_tensorboard_path: where should tensorflow logs be sent
    """
    # Create data loaders
    train_loader = DataLoader(datapath_train, batch_size, self.height_hr, self.width_hr,
                              self.upscaling_factor, crops_per_image, media_type,
                              self.channels, self.colorspace)
    # Validation data loader
    validation_loader = None
    if datapath_validation is not None:
        validation_loader = DataLoader(datapath_validation, batch_size, self.height_hr,
                                       self.width_hr, self.upscaling_factor,
                                       crops_per_image, media_type,
                                       self.channels, self.colorspace)
    test_loader = None
    if datapath_test is not None:
        test_loader = DataLoader(datapath_test, 1, self.height_hr, self.width_hr,
                                 self.upscaling_factor, 1, media_type,
                                 self.channels, self.colorspace)
    # Use several workers on CPU for preparing batches
    enqueuer = OrderedEnqueuer(train_loader, use_multiprocessing=True, shuffle=True)
    enqueuer.start(workers=workers, max_queue_size=max_queue_size)
    output_generator = enqueuer.get()
    # Callback: save generator weights whenever 'Perceptual_loss' improves
    modelcheckpoint = ModelCheckpoint(
        os.path.join(log_weight_path, modelname + '2_{}X.h5'.format(self.upscaling_factor)),
        monitor='Perceptual_loss', save_best_only=True, save_weights_only=True,
        mode='min', verbose=1)
    modelcheckpoint.set_model(self.generator)
    # Callback: tensorboard
    # FIX: the original left 'tensorboard' unbound when log_tensorboard_path was
    # not set, yet unconditionally called tensorboard.on_epoch_end() in the epoch
    # loop -> NameError on the first epoch. Bind it to None and guard the call.
    tensorboard = None
    if log_tensorboard_path:
        tensorboard = TensorBoard(
            log_dir=os.path.join(log_tensorboard_path, modelname),
            histogram_freq=0, batch_size=batch_size, write_graph=True,
            write_grads=True, update_freq=log_tensorboard_update_freq)
        tensorboard.set_model(self.rtvsrgan)
    else:
        print(">> Not logging to tensorboard since no log_tensorboard_path is set")

    # Learning rate scheduler: halve LR at fixed iteration milestones
    def lr_scheduler(epoch, lr):
        factor = 0.5
        decay_step = [500, 1000, 1500, 2000]
        if epoch in decay_step and epoch:
            return lr * factor
        return lr

    lr_scheduler_gan = LearningRateScheduler(lr_scheduler, verbose=1)
    lr_scheduler_gan.set_model(self.rtvsrgan)
    lr_scheduler_gen = LearningRateScheduler(lr_scheduler, verbose=0)
    lr_scheduler_gen.set_model(self.generator)
    lr_scheduler_dis = LearningRateScheduler(lr_scheduler, verbose=0)
    lr_scheduler_dis.set_model(self.discriminator)
    lr_scheduler_ra = LearningRateScheduler(lr_scheduler, verbose=0)
    lr_scheduler_ra.set_model(self.ra_discriminator)

    # Callback: format input value
    def named_logs(model, logs):
        """Transform train_on_batch return value to dict expected by on_batch_end callback"""
        return dict(zip(model.metrics_names, logs))

    # Shape of output from discriminator (typo 'disciminator_...' fixed locally)
    discriminator_output_shape = list(self.ra_discriminator.output_shape)
    discriminator_output_shape[0] = batch_size
    discriminator_output_shape = tuple(discriminator_output_shape)
    # VALID / FAKE targets for discriminator
    real = np.ones(discriminator_output_shape)
    fake = np.zeros(discriminator_output_shape)
    # Each epoch == "update iteration" as defined in the paper
    print_losses = {"GAN": [], "D": []}
    start_epoch = datetime.datetime.now()
    # Loop through epochs / iterations
    for epoch in range(first_epoch, int(epochs) + first_epoch):
        lr_scheduler_gan.on_epoch_begin(epoch)
        lr_scheduler_ra.on_epoch_begin(epoch)
        lr_scheduler_dis.on_epoch_begin(epoch)
        lr_scheduler_gen.on_epoch_begin(epoch)
        # Start epoch time
        if epoch % print_frequency == 0:
            print("\nEpoch {}/{}:".format(epoch + 1, epochs + first_epoch))
            start_epoch = datetime.datetime.now()
        # Train discriminator
        self.discriminator.trainable = True
        self.ra_discriminator.trainable = True
        imgs_lr, imgs_hr = next(output_generator)
        generated_hr = self.generator.predict(imgs_lr)
        real_loss = self.ra_discriminator.train_on_batch([imgs_hr, generated_hr], real)
        fake_loss = self.ra_discriminator.train_on_batch([generated_hr, imgs_hr], fake)
        discriminator_loss = 0.5 * np.add(real_loss, fake_loss)
        # Train generator: 10 generator updates per discriminator update
        self.discriminator.trainable = False
        self.ra_discriminator.trainable = False
        for _ in tqdm(range(10), ncols=60, desc=">> Training generator"):
            imgs_lr, imgs_hr = next(output_generator)
            gan_loss = self.rtvsrgan.train_on_batch([imgs_lr, imgs_hr], [imgs_hr, real, imgs_hr])
        # Callbacks (only the last of the 10 generator batches is logged)
        logs = named_logs(self.rtvsrgan, gan_loss)
        if tensorboard is not None:
            tensorboard.on_epoch_end(epoch, logs)
        # Callbacks
        if datapath_validation:
            validation_losses = self.generator.evaluate_generator(
                validation_loader, steps=steps_per_validation,
                use_multiprocessing=False, workers=1)
            # NOTE(review): checkpoint sees GAN logs, not the validation logs
            # computed above — monitor 'Perceptual_loss' must exist in GAN logs.
            modelcheckpoint.on_epoch_end(epoch, logs)
        # Save losses
        print_losses['GAN'].append(gan_loss)
        print_losses['D'].append(discriminator_loss)
        # Show the progress
        if epoch % print_frequency == 0:
            g_avg_loss = np.array(print_losses['GAN']).mean(axis=0)
            d_avg_loss = np.array(print_losses['D']).mean(axis=0)
            print(">> Time: {}s\n>> GAN: {}\n>> Discriminator: {}".format(
                (datetime.datetime.now() - start_epoch).seconds,
                ", ".join(["{}={:.4f}".format(k, v) for k, v in zip(
                    self.rtvsrgan.metrics_names, g_avg_loss)]),
                ", ".join(["{}={:.4f}".format(k, v) for k, v in zip(
                    self.discriminator.metrics_names, d_avg_loss)])))
            print_losses = {"GAN": [], "D": []}
            # Run validation inference if specified
            if datapath_validation:
                print(">> Validation Losses: {}".format(", ".join([
                    "{}={:.4f}".format(k, v) for k, v in zip(
                        self.generator.metrics_names, validation_losses)])))
        # If test images are supplied, run model on them and save to log_test_path
        if datapath_test and epoch % log_test_frequency == 0:
            plot_test_images(self.generator, test_loader, datapath_test,
                             log_test_path, epoch, modelname,
                             channels=self.channels, colorspace=self.colorspace)
        # Check if we should save the network weights
        if log_weight_frequency and epoch % log_weight_frequency == 0:
            # Save the network weights
            self.save_weights(os.path.join(log_weight_path, modelname))
test_loader = gen('../test/' + tag + '/tmp_labels.txt', '../test/' + tag + '/', batchsize=batch_size, maxlabellength=maxlabellength, imagesize=(img_h, img_w)) #train_loader = gen('../all/train.txt', '../all/', batchsize=batch_size, maxlabellength=maxlabellength, imagesize=(img_h, img_w)) #test_loader = gen('../all/test.txt', '../all/', batchsize=batch_size, maxlabellength=maxlabellength, imagesize=(img_h, img_w)) #train_loader = gen('../all/train_13_100.txt', '../all/', batchsize=batch_size, maxlabellength=maxlabellength, imagesize=(img_h, img_w)) #test_loader = gen('../all/test_13_100.txt', '../all/', batchsize=batch_size, maxlabellength=maxlabellength, imagesize=(img_h, img_w)) checkpoint = ModelCheckpoint(filepath='./models/' + tag + '/weights_' + tag + '_shufflenet-{epoch:02d}-{val_loss:.2f}.h5', monitor='val_loss', save_best_only=False, save_weights_only=True) checkpoint.set_model(save_model) #lr_schedule = lambda epoch: 0.0005 * 0.4**epoch #lr_schedule = lambda epoch: 0.005 * 20 * 0.4 / (epoch + 1) #lr_schedule = lambda epoch: 0.00135 * 2 * 0.33**epoch lr_schedule = lambda epoch: 0.0005 * 1 * 0.55**epoch learning_rate = np.array([lr_schedule(i) for i in range(30)]) changelr = LearningRateScheduler(lambda epoch: float(learning_rate[epoch])) earlystop = EarlyStopping(monitor='val_loss', patience=2, verbose=1) tensorboard = TensorBoard(log_dir='./models/logs', write_graph=True) print('-----------Start training-----------') model.fit_generator( train_loader, steps_per_epoch=train_size // batch_size, epochs=30, initial_epoch=0,
# ??? # model = primary_net() loss = 'mse' metrics = ['mae', 'mse', 'mape', 'msle', 'logcosh', 'cosine'] model.compile(loss=loss, loss_weights=[0, 0, 1, 1, 1e4, 1], optimizer=Adam(lr=lr, beta_1=beta_1), metrics=metrics) model.summary() # model = load_model('model28log.h5', custom_objects=custom_objects) # model_name = input('Enter model name: ') model_name = 'model_final_16x16' print('model name: ' + model_name) checkpoint = ModelCheckpoint(model_name + '_{epoch:d}.h5', period=1) checkpoint.set_model(model) checkpoint_weight = ModelCheckpoint(model_name + '_weights_{epoch:d}.h5', period=1, save_weights_only=True) checkpoint_weight.set_model(model) # early_stop = EarlyStopping(monitor='val_1_loss', min_delta=0.1, patience=30, restore_best_weights=True, verbose=1) callbacks = [TimeHistory(), checkpoint, checkpoint_weight] # callbacks = [TimeHistory()] # callbacks = [TimeHistory(), EarlyStopping(monitor='loss', min_delta=0.1, patience=100, # restore_best_weights=True, verbose=1)] # early_stop = callbacks.EarlyStopping(patience=200, verbose=1) # with open('model_hist_data28log.json', 'r') as f: # hist_data = json.load(f) dataset = datasets[key] # data_size = dataset.data_size # val_size = dataset.val_size
# Warm-start from ImageNet-pretrained ResNet50 weights (top layers excluded);
# training simply continues from random init if loading fails.
try:
    pre_trained_weights = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
    model.load_weights(pre_trained_weights, by_name=True)
except Exception as e:
    # Best-effort: report and carry on without pretrained weights.
    print('load pre-trained weights error {}'.format(e))
# Show the class-name -> index mapping produced by the data generator.
for cls, idx in train_generator.class_indices.items():
    print('Class #{} = {}'.format(idx, cls))
# Save weights every epoch; filename embeds epoch number and training loss.
checkpoint = ModelCheckpoint(filepath='weights/weights-{epoch:03d}-{loss:.2f}.h5', monitor='loss', save_best_only=False, save_weights_only=True)
checkpoint.set_model(model)
model.compile(optimizer=Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): both callbacks monitor training 'loss', not 'val_loss' —
# confirm that is intentional.
lr_reducer = ReduceLROnPlateau(monitor='loss', factor=np.sqrt(0.1), cooldown=0, patience=2, min_lr=0.5e-6)
earlystopping = EarlyStopping(monitor='loss', patience=5, verbose=1)
# ('tensorbord' is a typo but the name may be referenced below; left unchanged)
tensorbord = TensorBoard(log_dir='weights/logs', write_graph=True)
else: start_epoch_num = 0 spatial_stream = spatial.basic() print('set network') print('complete') sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True) spatial_stream.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy']) print('complete network setting') tmp_numiter = len(train_loader.get_train_data_list())/batch_size num_iter = int(tmp_numiter)+1 if tmp_numiter - int(tmp_numiter) > 0 else int(tmp_numiter) tbCallBack.set_model(spatial_stream) mcCallBack.set_model(spatial_stream) loss_session = tf.Session() best_val_acc = 0 for epoch in range(start_epoch_num, start_epoch_num + num_epoch): print('Epoch', epoch) train_acc, train_loss = train_1epoch(spatial_stream, train_loader, num_iter) print("train_loss:", train_loss, "train_acc:", train_acc) tr_val_acc, tr_val_loss = validation_1epoch(spatial_stream, train_val_loader, sess) print("tr_val_loss:", tr_val_loss, "tr_val_acc:", tr_val_acc) val_acc, val_loss = validation_1epoch(spatial_stream, test_loader, sess) print("val_loss:", val_loss, "val_acc:", val_acc)
def run_hcomb_final(h, ID, hcm, model_dir, INTERMEDIATE_PLOTS, GLOBAL_GRADIENT_NORM_PLOT):
    """Run the final experiment for one hyperparameter combination.

    Trains a stateful CuDNNLSTM+MLP model on all six folds for h.MAX_EPOCHS,
    checkpointing and resuming from the latest saved state, then rebuilds a
    batch-size-1 copy of the network and evaluates it on the test set.

    :param h: hyperparameter combination object (BATCH_SIZE, TIME_STEPS, ...)
    :param ID: id of this combination inside the combination manager
    :param hcm: hyperparameter combination manager (progress bookkeeping)
    :param model_dir: directory in which to create the 'final' save dir
    :param INTERMEDIATE_PLOTS: plot metrics after each epoch if True
    :param GLOBAL_GRADIENT_NORM_PLOT: track/plot the global gradient norm
    :return: go_to_next_stage (NOTE(review): only bound in the no-weights error
             branch — on the success path the final return raises NameError
             unless it is set elsewhere; confirm against the original file)
    """
    ################################################# FINAL EXPERIMENT
    start = timer()
    ALL_FOLDS = list(range(1, 7))
    print(5 * '\n' + 'Starting Final Experiment, training {} epochs...\n'.format(h.MAX_EPOCHS))
    model_save_dir = os.path.join(model_dir, 'final')
    os.makedirs(model_save_dir, exist_ok=True)
    ################################################# MODEL DEFINITION
    reg = None
    adam_kwargs = {'clipnorm': 1.0}
    kernel_initializer_dense = 'glorot_uniform'
    # Special-case configuration keyed off the directory name.
    if 'changbin' in model_dir:
        from keras import regularizers
        reg = regularizers.l2(0.0000459)
    subsample_time_steps = False
    if h.TIME_STEPS >= 2000:
        subsample_time_steps = True
    # Long sequences: stronger regularization + tighter gradient clipping.
    if h.TIME_STEPS >= 2000:
        from keras import regularizers
        reg = regularizers.l2(0.001)
        adam_kwargs['clipvalue'] = 0.3
        adam_kwargs['clipnorm'] = 0.7
        kernel_initializer_dense = 'he_uniform'  # should prevent exploding grads for ReLU
    print('\nBuild model...\n')
    time_steps = h.TIME_STEPS if not subsample_time_steps else h.TIME_STEPS // 2
    x = Input(batch_shape=(h.BATCH_SIZE, time_steps, h.N_FEATURES), name='Input', dtype='float32')
    y = x
    # Input dropout (same mask across all time steps via noise_shape)
    y = Dropout(h.INPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, h.N_FEATURES))(y)
    for units in h.UNITS_PER_LAYER_LSTM:
        y = CuDNNLSTM(units, return_sequences=True, stateful=True, kernel_regularizer=reg, recurrent_regularizer=reg)(y)
        # LSTM Output dropout
        y = Dropout(h.LSTM_OUTPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, units))(y)
    for units in h.UNITS_PER_LAYER_MLP:
        if units != h.N_CLASSES:
            y = Dense(units, activation='relu', kernel_regularizer=reg, kernel_initializer=kernel_initializer_dense)(y)
        else:
            y = Dense(units, activation='linear', kernel_regularizer=reg)(y)
        # MLP Output dropout but not last layer
        if units != h.N_CLASSES:
            y = Dropout(h.MLP_OUTPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, units))(y)
    model = Model(x, y)
    model.summary()
    print(5 * '\n')
    my_loss = tensorflow_utils.my_loss_builder(h.MASK_VAL, tensorflow_utils.get_loss_weights(ALL_FOLDS, h.TRAIN_SCENES, h.LABEL_MODE))
    ################################################# LOAD CHECKPOINTED MODEL
    model_is_resumed = False
    epochs_finished_old = None
    # use val_fold = 0 as dummy for finished epochs
    val_fold = 1
    val_fold_str = 'final_experiment: {} ({} / {})'.format(val_fold, 1, 1)
    latest_weights_path, epochs_finished, val_acc, best_epoch_, best_val_acc_, epochs_without_improvement_ \
        = tensorflow_utils.latest_training_state(model_save_dir)
    if latest_weights_path is not None:
        model.load_weights(latest_weights_path)
        model_is_resumed = True
        # Reconcile checkpoint state with the bookkeeping in the hcomb list.
        if h.epochs_finished[val_fold - 1] != epochs_finished:
            epochs_finished_old = h.epochs_finished[val_fold - 1]
            print('MISMATCH: Latest state in hyperparameter combination list is different to checkpointed state.')
            h.epochs_finished[val_fold - 1] = epochs_finished
            h.val_acc[val_fold - 1] = val_acc
            hcm.replace_at_id(ID, h)
    ################################################# COMPILE MODEL
    adam = Adam(lr=h.LEARNING_RATE, **adam_kwargs)
    model.compile(optimizer=adam, loss=my_loss, metrics=None, sample_weight_mode='temporal')
    print('\nModel compiled.\n')
    ################################################# DATA LOADER
    use_multithreading = True
    BUFFER = utils.get_buffer_size_wrt_time_steps(h.TIME_STEPS)
    train_loader = tr_utils.create_train_dataloader(h.LABEL_MODE, ALL_FOLDS, -1, h.BATCH_SIZE,
                                                    h.TIME_STEPS, h.MAX_EPOCHS, 160, 13,
                                                    BUFFER=BUFFER, use_multithreading=use_multithreading)
    ################################################# CALLBACKS
    model_ckp_last = ModelCheckpoint(os.path.join(model_save_dir,
                                                  'model_ckp_epoch_{epoch:02d}-val_acc_{val_final_acc:.3f}.hdf5'),
                                     verbose=1, monitor='val_final_acc')
    model_ckp_last.set_model(model)
    args = [h.OUTPUT_THRESHOLD, h.MASK_VAL, h.MAX_EPOCHS, val_fold_str,
            GLOBAL_GRADIENT_NORM_PLOT, h.RECURRENT_DROPOUT, h.METRIC]
    # training phase
    train_phase = tr_utils.Phase('train', model, train_loader, BUFFER, *args,
                                 no_new_weighting=True if 'nnw' in model_save_dir else False,
                                 changbin_recurrent_dropout=True if 'changbin' in model_dir else False,
                                 subsample_time_steps=subsample_time_steps)
    if model_is_resumed:
        # Restore metric history so plots/metrics continue seamlessly.
        # NOTE(review): bare except silently ignores any failure here — the run
        # then continues with empty histories; consider narrowing/logging.
        try:
            old_metrics = utils.load_metrics(model_save_dir, name="metrics_train")
            # merge metrics
            h.METRIC = old_metrics['metric']
            train_phase.metric = h.METRIC
            train_iterations_done = old_metrics['train_losses'].shape[0]
            epochs_done = old_metrics['train_accs'].shape[0]
            if epochs_finished_old is not None:
                # Truncate histories to the epochs actually checkpointed.
                epochs_done_old = epochs_done
                epochs_done = epochs_done if epochs_finished > epochs_done else epochs_finished
                train_iterations_done = int(train_iterations_done / epochs_done_old) * epochs_done
            train_phase.losses = old_metrics['train_losses'].tolist()[:train_iterations_done]
            train_phase.accs = old_metrics['train_accs'].tolist()[:epochs_done]
            train_phase.sens_spec_class_scene = old_metrics['train_sens_spec_class_scene'].tolist()[:epochs_done]
            if 'global_gradient_norm' in old_metrics:
                train_phase.global_gradient_norms = old_metrics['global_gradient_norm'].tolist()[:train_iterations_done]
        except:
            pass
        train_phase.resume_from_epoch(h.epochs_finished[val_fold - 1] + 1)
    # Main training loop (no validation in the final experiment).
    for e in range(h.epochs_finished[val_fold - 1], h.MAX_EPOCHS):
        train_loss_is_nan, _ = train_phase.run()
        if train_loss_is_nan:
            print('\n\n\n---------------------------------------\n\n\n')
            print("ERROR: Training loss is NaN.")
            print('\n\n\n---------------------------------------\n\n\n')
            break
        # Dummy 0.0 val-acc: there is no validation set in the final run.
        tr_utils.update_latest_model_ckp(model_ckp_last, model_save_dir, e, 0.0)
        metrics = {
            'metric': h.METRIC,
            'train_losses': np.array(train_phase.losses),
            'train_accs': np.array(train_phase.accs),
            'train_sens_spec_class_scene': np.array(train_phase.sens_spec_class_scene),
        }
        if GLOBAL_GRADIENT_NORM_PLOT:
            metrics['global_gradient_norm'] = np.array(train_phase.global_gradient_norms)
        utils.pickle_metrics(metrics, model_save_dir, name="metrics_train")
        hcm.finish_epoch(ID, h, 0.0, 0.0, 0.0, 0.0, val_fold - 1, e + 1, e + 1, (timer() - start) / 60)
        if INTERMEDIATE_PLOTS:
            plot.plot_metrics(metrics, model_save_dir)
        if GLOBAL_GRADIENT_NORM_PLOT:
            plot.plot_global_gradient_norm(np.array(train_phase.global_gradient_norms), model_save_dir, epochs_done=e + 1)
    # Free the training graph before building the test-time model.
    del model
    K.clear_session()
    hcm.finish_hcomb(ID, h)
    ################################################## TESTING
    test_loader = tr_utils.create_test_dataloader(h.LABEL_MODE)
    ################################################# MODEL DEFINITION
    # Same architecture, rebuilt with batch size 1 and variable sequence length.
    print('\nBuild model for testing...\n')
    x = Input(batch_shape=(1, None, h.N_FEATURES), name='Input', dtype='float32')
    y = x
    # Input dropout
    y = Dropout(h.INPUT_DROPOUT, noise_shape=(1, 1, h.N_FEATURES))(y)
    for units in h.UNITS_PER_LAYER_LSTM:
        y = CuDNNLSTM(units, return_sequences=True, stateful=True, kernel_regularizer=reg, recurrent_regularizer=reg)(y)
        # LSTM Output dropout
        y = Dropout(h.LSTM_OUTPUT_DROPOUT, noise_shape=(1, 1, units))(y)
    for units in h.UNITS_PER_LAYER_MLP:
        if units != h.N_CLASSES:
            y = Dense(units, activation='relu', kernel_regularizer=reg, kernel_initializer=kernel_initializer_dense)(y)
        else:
            y = Dense(units, activation='linear', kernel_regularizer=reg)(y)
        # MLP Output dropout but not last layer
        if units != h.N_CLASSES:
            y = Dropout(h.MLP_OUTPUT_DROPOUT, noise_shape=(1, 1, units))(y)
    model = Model(x, y)
    model.summary()
    latest_weights_path, _, _, _, _, _ = tensorflow_utils.latest_training_state(model_save_dir)
    if latest_weights_path is not None:
        model.load_weights(latest_weights_path)
        model.compile(optimizer=adam, loss=my_loss, metrics=None)
        print('\nModel compiled.\n')
        test_phase = tr_utils.TestPhase(model, test_loader, h.OUTPUT_THRESHOLD, h.MASK_VAL, 1,
                                        val_fold_str, model_save_dir,
                                        metric=('BAC', 'BAC2'),
                                        ret=('final', 'per_class', 'per_class_scene', 'per_scene'))
        test_loss_is_nan, _ = test_phase.run()  # NaN flag currently unused here
        metrics_test = {
            'metric': h.METRIC,
            'test_accs': np.array(test_phase.accs),
            'test_accs_bac2': np.array(test_phase.accs_bac2),
            'test_class_accs': np.array(test_phase.class_accs),
            'test_class_accs_bac2': np.array(test_phase.class_accs_bac2),
            'test_class_scene_accs': np.array(test_phase.class_scene_accs),
            'test_class_scene_accs_bac2': np.array(test_phase.class_scene_accs_bac2),
            'test_scene_accs': np.array(test_phase.scene_accs),
            'test_scene_accs_bac2': np.array(test_phase.scene_accs_bac2),
            'test_sens_spec_class_scene': np.array(test_phase.sens_spec_class_scene),
            'test_sens_spec_class': np.array(test_phase.sens_spec_class)
        }
        utils.pickle_metrics(metrics_test, model_save_dir)
    else:
        print('\n\n\n---------------------------------------\n\n\n')
        print("ERROR: No testing possible, because no trained model saved.")
        print('\n\n\n---------------------------------------\n\n\n')
        go_to_next_stage = False
    return go_to_next_stage
def run_hcomb_cv(h, ID, hcm, model_dir, INTERMEDIATE_PLOTS, GLOBAL_GRADIENT_NORM_PLOT):
    """Run staged cross-validation for one hyperparameter combination.

    For each validation fold: build a stateful CuDNNLSTM+MLP model, resume from
    the latest checkpoint if present, train with early stopping, and record the
    best validation accuracies. Afterwards, fold results are aggregated (and
    merged with earlier stages when h.STAGE > 1) to decide whether this
    combination advances to the next stage.

    :return: True if the combination advances to the next stage, else False.
    """
    ################################################# CROSS VALIDATION
    start = timer()
    NUMBER_OF_CLASSES = 13
    # METRICS
    ALL_FOLDS = h.ALL_FOLDS if h.ALL_FOLDS != -1 else list(range(1, 7))
    # NOTE(review): [[0]*N]*k shares ONE inner list across all folds; it works
    # here only because whole rows are reassigned, never mutated in place.
    best_val_class_accuracies_over_folds = [[0] * NUMBER_OF_CLASSES] * len(ALL_FOLDS)
    best_val_acc_over_folds = [0] * len(ALL_FOLDS)
    best_val_class_accuracies_over_folds_bac2 = [[0] * NUMBER_OF_CLASSES] * len(ALL_FOLDS)
    best_val_acc_over_folds_bac2 = [0] * len(ALL_FOLDS)
    go_to_next_stage = False
    subsample_time_steps = False
    if h.TIME_STEPS >= 2000:
        subsample_time_steps = True
    print(5 * '\n' + 'Starting Cross Validation STAGE {}...\n'.format(h.STAGE))
    for i_val_fold, val_fold in enumerate(h.VAL_FOLDS):
        model_save_dir = os.path.join(model_dir, 'val_fold{}'.format(val_fold))
        os.makedirs(model_save_dir, exist_ok=True)
        TRAIN_FOLDS = list(set(ALL_FOLDS).difference({val_fold}))
        val_fold_str = 'val_fold: {} ({} / {})'.format(val_fold, i_val_fold + 1, len(h.VAL_FOLDS))
        ################################################# MODEL DEFINITION
        print('\nBuild model...\n')
        time_steps = h.TIME_STEPS if not subsample_time_steps else h.TIME_STEPS // 2
        x = Input(batch_shape=(h.BATCH_SIZE, time_steps, h.N_FEATURES), name='Input', dtype='float32')
        y = x
        # Input dropout (same mask across all time steps via noise_shape)
        y = Dropout(h.INPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, h.N_FEATURES))(y)
        for units in h.UNITS_PER_LAYER_LSTM:
            y = CuDNNLSTM(units, return_sequences=True, stateful=True)(y)
            # LSTM Output dropout
            y = Dropout(h.LSTM_OUTPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, units))(y)
        for units in h.UNITS_PER_LAYER_MLP:
            if units != h.N_CLASSES:
                y = Dense(units, activation='relu')(y)
            else:
                y = Dense(units, activation='linear')(y)
            # MLP Output dropout but not last layer
            if units != h.N_CLASSES:
                y = Dropout(h.MLP_OUTPUT_DROPOUT, noise_shape=(h.BATCH_SIZE, 1, units))(y)
        model = Model(x, y)
        model.summary()
        print(5 * '\n')
        my_loss = tensorflow_utils.my_loss_builder(h.MASK_VAL, tensorflow_utils.get_loss_weights(TRAIN_FOLDS, h.TRAIN_SCENES, h.LABEL_MODE))
        ################################################# LOAD CHECKPOINTED MODEL
        model_is_resumed = False
        epochs_finished_old = None
        latest_weights_path, epochs_finished, val_acc, best_epoch_, best_val_acc_, epochs_without_improvement_ \
            = tensorflow_utils.latest_training_state(model_save_dir)
        if latest_weights_path is not None:
            model.load_weights(latest_weights_path)
            model_is_resumed = True
            # Reconcile checkpoint state with the bookkeeping in the hcomb list.
            if h.epochs_finished[val_fold - 1] != epochs_finished:
                epochs_finished_old = h.epochs_finished[val_fold - 1]
                print('MISMATCH: Latest state in hyperparameter combination list is different to checkpointed state.')
                h.epochs_finished[val_fold - 1] = epochs_finished
                h.val_acc[val_fold - 1] = val_acc
                hcm.replace_at_id(ID, h)
        ################################################# COMPILE MODEL
        adam = Adam(lr=h.LEARNING_RATE, clipnorm=1.)
        model.compile(optimizer=adam, loss=my_loss, metrics=None, sample_weight_mode='temporal')
        print('\nModel compiled.\n')
        ################################################# DATA LOADER
        use_multithreading = True
        BUFFER = utils.get_buffer_size_wrt_time_steps(h.TIME_STEPS)
        train_loader, val_loader = tr_utils.create_dataloaders(h.LABEL_MODE, TRAIN_FOLDS, h.TRAIN_SCENES,
                                                               h.BATCH_SIZE, h.TIME_STEPS, h.MAX_EPOCHS,
                                                               h.N_FEATURES, h.N_CLASSES, [val_fold],
                                                               h.VAL_STATEFUL, BUFFER=BUFFER,
                                                               use_multithreading=use_multithreading,
                                                               subsample_time_steps=subsample_time_steps)
        ################################################# CALLBACKS
        model_ckp_last = ModelCheckpoint(os.path.join(model_save_dir,
                                                      'model_ckp_epoch_{epoch:02d}-val_acc_{val_final_acc:.3f}.hdf5'),
                                         verbose=1, monitor='val_final_acc')
        model_ckp_last.set_model(model)
        model_ckp_best = ModelCheckpoint(os.path.join(model_save_dir,
                                                      'best_model_ckp_epoch_{epoch:02d}-val_acc_{val_final_acc:.3f}.hdf5'),
                                         verbose=1, monitor='val_final_acc', save_best_only=True)
        model_ckp_best.set_model(model)
        args = [h.OUTPUT_THRESHOLD, h.MASK_VAL, h.MAX_EPOCHS, val_fold_str,
                GLOBAL_GRADIENT_NORM_PLOT, h.RECURRENT_DROPOUT, h.METRIC]
        # training phase
        train_phase = tr_utils.Phase('train', model, train_loader, BUFFER, *args,
                                     no_new_weighting=True if 'nnw' in model_save_dir else False,
                                     subsample_time_steps=subsample_time_steps)
        # validation phase
        val_phase = tr_utils.Phase('val', model, val_loader, BUFFER, *args,
                                   no_new_weighting=True if 'nnw' in model_save_dir else False)
        # needed for early stopping
        best_val_acc = -1 if not model_is_resumed else best_val_acc_
        best_val_acc_bac2 = -1
        best_epoch = 0 if not model_is_resumed else best_epoch_
        epochs_without_improvement = 0 if not model_is_resumed else epochs_without_improvement_
        if model_is_resumed:
            # Restore metric history so training continues seamlessly.
            old_metrics = utils.load_metrics(model_save_dir)
            # merge metrics
            h.METRIC = old_metrics['metric']
            train_phase.metric = h.METRIC
            val_phase.metric = h.METRIC
            train_iterations_done = old_metrics['train_losses'].shape[0]
            val_iterations_done = old_metrics['val_losses'].shape[0]
            epochs_done = old_metrics['val_accs'].shape[0]
            if epochs_finished_old is not None:
                # Truncate histories to the epochs actually checkpointed.
                epochs_done_old = epochs_done
                epochs_done = epochs_done if epochs_finished > epochs_done else epochs_finished
                train_iterations_done = int(train_iterations_done / epochs_done_old) * epochs_done
                val_iterations_done = int(val_iterations_done / epochs_done_old) * epochs_done
            train_phase.losses = old_metrics['train_losses'].tolist()[:train_iterations_done]
            train_phase.accs = old_metrics['train_accs'].tolist()[:epochs_done]
            val_phase.losses = old_metrics['val_losses'].tolist()[:val_iterations_done]
            val_phase.accs = old_metrics['val_accs'].tolist()[:epochs_done]
            val_phase.accs_bac2 = old_metrics['val_accs_bac2'].tolist()[:epochs_done]
            val_phase.class_accs = old_metrics['val_class_accs'].tolist()[:epochs_done]
            val_phase.class_accs_bac2 = old_metrics['val_class_accs_bac2'].tolist()[:epochs_done]
            val_phase.class_scene_accs = old_metrics['val_class_scene_accs'].tolist()[:epochs_done]
            val_phase.class_scene_accs_bac2 = old_metrics['val_class_scene_accs_bac2'].tolist()[:epochs_done]
            val_phase.scene_accs = old_metrics['val_scene_accs'].tolist()[:epochs_done]
            val_phase.scene_accs_bac2 = old_metrics['val_scene_accs_bac2'].tolist()[:epochs_done]
            train_phase.sens_spec_class_scene = old_metrics['train_sens_spec_class_scene'].tolist()[:epochs_done]
            val_phase.sens_spec_class_scene = old_metrics['val_sens_spec_class_scene'].tolist()[:epochs_done]
            val_phase.sens_spec_class = old_metrics['val_sens_spec_class'].tolist()[:epochs_done]
            if 'global_gradient_norm' in old_metrics:
                train_phase.global_gradient_norms = old_metrics['global_gradient_norm'].tolist()[:train_iterations_done]
            best_val_acc = np.max(val_phase.accs)
            best_val_acc_bac2 = old_metrics['val_accs_bac2'][np.argmax(val_phase.accs)]
            # set the dataloaders to correct epoch
            train_phase.resume_from_epoch(h.epochs_finished[val_fold - 1] + 1)
            val_phase.resume_from_epoch(h.epochs_finished[val_fold - 1] + 1)
        stage_was_finished = True
        loss_is_nan = False
        for e in range(h.epochs_finished[val_fold - 1], h.MAX_EPOCHS):
            # early stopping
            if epochs_without_improvement >= h.PATIENCE_IN_EPOCHS and h.PATIENCE_IN_EPOCHS > 0:
                break
            else:
                stage_was_finished = False
            train_loss_is_nan, _ = train_phase.run()
            val_loss_is_nan, _ = val_phase.run()
            if train_loss_is_nan or val_loss_is_nan:
                loss_is_nan = True
                print('\n\n\n---------------------------------------\n\n\n')
                print("ERROR: Training loss is NaN.")
                print('\n\n\n---------------------------------------\n\n\n')
                break
            tr_utils.update_latest_model_ckp(model_ckp_last, model_save_dir, e, val_phase.accs[-1])
            tr_utils.update_best_model_ckp(model_ckp_best, model_save_dir, e, val_phase.accs[-1])
            metrics = {
                'metric': h.METRIC,
                'train_losses': np.array(train_phase.losses),
                'train_accs': np.array(train_phase.accs),
                'val_losses': np.array(val_phase.losses),
                'val_accs': np.array(val_phase.accs),
                'val_accs_bac2': np.array(val_phase.accs_bac2),
                'val_class_accs': np.array(val_phase.class_accs),
                'val_class_accs_bac2': np.array(val_phase.class_accs_bac2),
                'val_class_scene_accs': np.array(val_phase.class_scene_accs),
                'val_class_scene_accs_bac2': np.array(val_phase.class_scene_accs_bac2),
                'val_scene_accs': np.array(val_phase.scene_accs),
                'val_scene_accs_bac2': np.array(val_phase.scene_accs_bac2),
                'train_sens_spec_class_scene': np.array(train_phase.sens_spec_class_scene),
                'val_sens_spec_class_scene': np.array(val_phase.sens_spec_class_scene),
                'val_sens_spec_class': np.array(val_phase.sens_spec_class)
            }
            if GLOBAL_GRADIENT_NORM_PLOT:
                metrics['global_gradient_norm'] = np.array(train_phase.global_gradient_norms)
            utils.pickle_metrics(metrics, model_save_dir)
            # Early-stopping bookkeeping on the final validation accuracy.
            if val_phase.accs[-1] > best_val_acc:
                best_val_acc = val_phase.accs[-1]
                best_val_acc_bac2 = val_phase.accs_bac2[-1]
                epochs_without_improvement = 0
                best_epoch = e + 1
            else:
                epochs_without_improvement += 1
            hcm.finish_epoch(ID, h, val_phase.accs[-1], best_val_acc, val_phase.accs_bac2[-1],
                             best_val_acc_bac2, val_fold - 1, e + 1, best_epoch, (timer() - start) / 60)
            if INTERMEDIATE_PLOTS:
                plot.plot_metrics(metrics, model_save_dir)
            if GLOBAL_GRADIENT_NORM_PLOT:
                plot.plot_global_gradient_norm(np.array(train_phase.global_gradient_norms), model_save_dir, epochs_done=e + 1)
            del metrics
    # NOTE(review): 'loss_is_nan', 'stage_was_finished', 'val_phase', 'best_epoch'
    # below reflect the LAST fold processed by the loop above — confirm this
    # matches the original file's indentation/intent.
    if not loss_is_nan:
        if not stage_was_finished:
            best_val_class_accuracies_over_folds[val_fold - 1] = val_phase.class_accs[best_epoch - 1]
            best_val_acc_over_folds[val_fold - 1] = val_phase.accs[best_epoch - 1]
            best_val_class_accuracies_over_folds_bac2[val_fold - 1] = val_phase.class_accs_bac2[best_epoch - 1]
            best_val_acc_over_folds_bac2[val_fold - 1] = val_phase.accs_bac2[best_epoch - 1]
            ################################################# CROSS VALIDATION: MEAN AND VARIANCE
            best_val_class_accs_over_folds = np.array(best_val_class_accuracies_over_folds)
            best_val_accs_over_folds = np.array(best_val_acc_over_folds)
            best_val_class_accs_over_folds_bac2 = np.array(best_val_class_accuracies_over_folds_bac2)
            best_val_accs_over_folds_bac2 = np.array(best_val_acc_over_folds_bac2)
            metrics_over_folds = utils.create_metrics_over_folds_dict(best_val_class_accs_over_folds,
                                                                      best_val_accs_over_folds,
                                                                      best_val_class_accs_over_folds_bac2,
                                                                      best_val_accs_over_folds_bac2)
            if h.STAGE > 1:
                # Merge with fold results accumulated in earlier stages.
                metrics_over_folds_old = utils.load_metrics(model_dir)
                best_val_class_accs_over_folds += metrics_over_folds_old['best_val_class_accs_over_folds']
                best_val_accs_over_folds += metrics_over_folds_old['best_val_acc_over_folds']
                best_val_class_accs_over_folds_bac2 += metrics_over_folds_old['best_val_class_accs_over_folds_bac2']
                best_val_accs_over_folds_bac2 += metrics_over_folds_old['best_val_acc_over_folds_bac2']
                metrics_over_folds = utils.create_metrics_over_folds_dict(best_val_class_accs_over_folds,
                                                                          best_val_accs_over_folds,
                                                                          best_val_class_accs_over_folds_bac2,
                                                                          best_val_accs_over_folds_bac2)
            utils.pickle_metrics(metrics_over_folds, model_dir)
            if INTERMEDIATE_PLOTS:
                plot.plot_metrics(metrics_over_folds, model_dir)
            hcm.finish_stage(ID, h,
                             metrics_over_folds['best_val_acc_mean_over_folds'],
                             metrics_over_folds['best_val_acc_std_over_folds'],
                             metrics_over_folds['best_val_acc_mean_over_folds_bac2'],
                             metrics_over_folds['best_val_acc_std_over_folds_bac2'],
                             timer() - start)
        else:
            # Stage already completed earlier: reuse persisted fold metrics.
            metrics_over_folds = utils.load_metrics(model_dir)
        # STAGE thresholds
        stage_thresholds = {1: 0.81, 2: 0.81, 3: np.inf}  # 3 is the last stage
        if metrics_over_folds['best_val_acc_mean_over_folds'] >= stage_thresholds[h.STAGE]:
            go_to_next_stage = True
        if go_to_next_stage:
            hcm.next_stage(ID, h)
        else:
            if h.STAGE == 3 or stage_thresholds[h.STAGE] != np.inf:
                hcm.finish_hcomb(ID, h)
        return go_to_next_stage
    else:
        # A NaN loss aborts the combination entirely.
        hcm.finish_hcomb(ID, h)
        return False
# log['args'] = args # log['style_names'] = styles[:args.nb_classes] # log['style_image_sizes'] = style_sizes # log['total_loss'] = [] # log['style_loss'] = {k: [] for k in args.style_layers} # log['content_loss'] = {k: [] for k in args.content_layers} # log['tv_loss'] = [] # save paths chkpt_path = args.checkpoint_path log_dir = args.log_dir weights_path = os.path.splitext(args.checkpoint_path)[0] + "_weights.h5" # checkpoints model_checkpoint = ModelCheckpoint(chkpt_path, monitor='total_loss') model_checkpoint.set_model(pastiche_net) # tensorboard if (log_dir): if not os.path.exists(log_dir): os.makedirs(log_dir) tensorboard = TensorBoard(log_dir=log_dir) tensorboard.set_model(pastiche_net) start_time = time.time() for it in range(args.num_iterations): if batch_idx >= batches_per_epoch: print('Epoch done. Going back to the beginning...') batch_idx = 0 # Get the batch
def train_model(model, data, config, include_tensorboard):
    """Train a stateful model across multiple datasets with manual callback driving.

    Runs up to config.max_epochs epochs; within each epoch every dataset is fit
    for one pass (first with validation, then a second generator without), the
    per-dataset logs are averaged, and the checkpoint/early-stopping/CSV/
    TensorBoard callbacks are invoked by hand with the averaged logs.

    :param model: compiled Keras model (stateful — states reset per dataset)
    :param data: object with a .datasets iterable; each dataset exposes
                 train_generators / valid_generators — TODO confirm contract
    :param config: provides model_file(), csv_log_file(), min_delta, patience,
                   max_epochs
    :param include_tensorboard: enable TensorBoard logging if True
    :return: None (NOTE(review): model_history is accumulated but never
             returned — confirm whether callers expect it)
    """
    model_history = History()
    model_history.on_train_begin()
    # Checkpoint: keep only the best model (by the default monitored quantity).
    saver = ModelCheckpoint(full_path(config.model_file()), verbose=1, save_best_only=True, period=1)
    saver.set_model(model)
    early_stopping = EarlyStopping(min_delta=config.min_delta, patience=config.patience, verbose=1)
    early_stopping.set_model(model)
    early_stopping.on_train_begin()
    csv_logger = CSVLogger(full_path(config.csv_log_file()))
    csv_logger.on_train_begin()
    if include_tensorboard:
        # ('tensorborad' is a typo but kept — used consistently below)
        tensorborad = TensorBoard(histogram_freq=10, write_images=True)
        tensorborad.set_model(model)
    else:
        # No-op stand-in so the on_epoch_end/on_train_end calls below still work.
        tensorborad = Callback()
    epoch = 0
    stop = False
    while(epoch <= config.max_epochs and stop == False):
        # Fresh per-epoch history; merged into model_history after averaging.
        epoch_history = History()
        epoch_history.on_train_begin()
        valid_sizes = []
        train_sizes = []
        print("Epoch:", epoch)
        for dataset in data.datasets:
            print("dataset:", dataset.name)
            # Stateful model: clear LSTM/GRU states between datasets.
            model.reset_states()
            dataset.reset_generators()
            valid_sizes.append(dataset.valid_generators[0].size())
            train_sizes.append(dataset.train_generators[0].size())
            # First pass: train with validation.
            fit_history = model.fit_generator(dataset.train_generators[0],
                                              dataset.train_generators[0].size(), nb_epoch=1,
                                              verbose=0,
                                              validation_data=dataset.valid_generators[0],
                                              nb_val_samples=dataset.valid_generators[0].size())
            epoch_history.on_epoch_end(epoch, last_logs(fit_history))
            train_sizes.append(dataset.train_generators[1].size())
            # Second pass: extra training generator, no validation.
            fit_history = model.fit_generator(dataset.train_generators[1],
                                              dataset.train_generators[1].size(), nb_epoch=1,
                                              verbose=0)
            epoch_history.on_epoch_end(epoch, last_logs(fit_history))
        # Average logs across datasets (weighted by the collected sizes).
        epoch_logs = average_logs(epoch_history, train_sizes, valid_sizes)
        model_history.on_epoch_end(epoch, logs=epoch_logs)
        saver.on_epoch_end(epoch, logs=epoch_logs)
        early_stopping.on_epoch_end(epoch, epoch_logs)
        csv_logger.on_epoch_end(epoch, epoch_logs)
        tensorborad.on_epoch_end(epoch, epoch_logs)
        epoch+= 1
        # EarlyStopping records a stopped_epoch > 0 once patience is exhausted.
        if early_stopping.stopped_epoch > 0:
            stop = True
    early_stopping.on_train_end()
    csv_logger.on_train_end()
    tensorborad.on_train_end({})