        samplefile=samplepath,
        after_n_batches=100,
        batchsize=100000,
        on_epoch_end=False,
        use_event=ev))

model, history = train.trainModel(
    nepochs=1,
    run_eagerly=True,
    batchsize=nbatch,
    batchsize_use_sum_of_squares=False,
    checkperiod=1,  # saves a checkpoint model every N epochs
    verbose=verbosity,
    backup_after_batches=100,
    additional_callbacks=callbacks + [
        CyclicLR(base_lr=learningrate / 5.,
                 max_lr=learningrate * 5.,
                 step_size=20)
    ])

loss_config.energy_loss_weight = 0.01
loss_config.position_loss_weight = 0.01

learningrate = 3e-5
model, history = train.trainModel(
    nepochs=1 + 3,
    run_eagerly=True,
    batchsize=nbatch,
    batchsize_use_sum_of_squares=False,
    checkperiod=1,  # saves a checkpoint model every N epochs
    verbose=verbosity,
    backup_after_batches=100,
    additional_callbacks=callbacks + [
def train_flow(self, dir_path, model_type, num_class, epoch, batch_size=128,
               lr=0.01, es=True, decay_lr=False, clr=False, tl=True):
    """
    Train the model on image data streamed from disk (on the fly).

    :param dir_path: directory where the images are stored
    :param model_type: model architecture; available options are 'vgg16',
        'vgg19', 'resnet50' and 'xception'
    :param num_class: number of classes
    :param epoch: number of epochs to train the model
    :param batch_size: batch size for mini-batch training
    :param lr: learning rate
        - if the cyclical learning rate is used, it is the minimum learning rate
        - if the step-decay learning rate is used, it is the initial learning rate
    :param es: whether to use early stopping
    :param decay_lr: whether to use learning rate decay
    :param clr: whether to use a cyclical learning rate
    :param tl: whether to use transfer learning (fine-tuning all layers)
    :return: the trained model
    """
    # without transfer learning, start from random weights
    if not tl:
        load_weights = None
    # with transfer learning, load weights pre-trained on ImageNet
    else:
        load_weights = 'imagenet'

    if model_type.lower() == 'vgg16':
        self.img_shape = (224, 224)
        base_model = VGG16(include_top=False, weights=load_weights,
                           input_shape=(224, 224, 3))
    elif model_type.lower() == 'vgg19':
        self.img_shape = (224, 224)
        base_model = VGG19(include_top=False, weights=load_weights,
                           input_shape=(224, 224, 3))
    elif model_type.lower() == 'resnet50':
        self.img_shape = (224, 224)
        base_model = ResNet50(include_top=False, weights=load_weights,
                              input_shape=(224, 224, 3))
    elif model_type.lower() == 'xception':
        self.img_shape = (299, 299)
        base_model = Xception(include_top=False, weights=load_weights,
                              input_shape=(299, 299, 3))
    else:
        raise ValueError("Error: model name not valid!")

    self.batch_size = batch_size

    x = base_model.output
    # flatten the backbone output and add the classification head
    x = Flatten()(x)
    # the number of units in the dense layer is 1024
    x = Dense(1024, activation="relu")(x)
    x = Dropout(0.5)(x)
    predictions = Dense(num_class, activation="softmax",
                        name='new_dense_layer')(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    optimizer = optimizers.SGD(lr=lr, momentum=0.9)
    model.compile(loss="categorical_crossentropy", metrics=["accuracy"],
                  optimizer=optimizer)

    # data augmentation: create a generator for augmenting training data
    train_datagen = ImageDataGenerator(featurewise_center=True,
                                       zoom_range=0.1,
                                       shear_range=0.6,
                                       rescale=1. / 255,
                                       rotation_range=6)
    # fit the train_datagen (compute statistics for pre-processing)
    # on a sample of the training data
    sample_train = self.sample_train(dir_path, 100)
    train_datagen.fit(sample_train)

    test_datagen = ImageDataGenerator(rescale=1. / 255)

    train_generator = train_datagen.flow_from_directory(
        dir_path + 'train',
        target_size=self.img_shape,
        batch_size=batch_size)
    valid_generator = test_datagen.flow_from_directory(
        dir_path + 'validation',
        target_size=self.img_shape,
        batch_size=batch_size)

    # checkpoint: save the model with the best validation accuracy
    model_path = self.name + '_model.h5'
    check_point = ModelCheckpoint(model_path, monitor='val_acc',
                                  save_best_only=True, mode='max')
    callback_list = [check_point]

    # if the step-decay learning rate scheduler is used
    if decay_lr:
        lrate = LearningRateScheduler(self.step_decay)
        callback_list.append(lrate)

    # if clr is True, use a cyclical learning rate
    if clr:
        clr_stepsize = 2 * math.ceil(37882 / batch_size)
        clr_triangular = CyclicLR(mode='triangular', base_lr=lr, max_lr=6 * lr,
                                  step_size=clr_stepsize)
        callback_list.append(clr_triangular)

    # if es is True, use early stopping
    if es:
        early_stop = EarlyStopping(monitor='val_acc', patience=10, mode='max')
        callback_list.append(early_stop)

    model.fit_generator(train_generator,
                        validation_data=valid_generator,
                        epochs=epoch,
                        callbacks=callback_list)

    # map from class name to prediction index
    label_map = train_generator.class_indices
    # swap key and value: map from prediction index back to class name
    label_map = dict((v, k) for k, v in label_map.items())
    # store the label map
    with open('label_map.json', 'w') as fp:
        json.dump(label_map, fp)

    return model
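# Illustrative usage sketch for train_flow() above (hypothetical wrapper name,
# directory layout and class count; assumes dir_path contains 'train/' and
# 'validation/' subfolders, as required by flow_from_directory):
#
# classifier = ImageClassifier(name='plants')   # hypothetical class owning train_flow
# model = classifier.train_flow(dir_path='data/',
#                               model_type='resnet50',
#                               num_class=5,
#                               epoch=30,
#                               batch_size=64,
#                               lr=1e-4,
#                               es=True,
#                               clr=True,   # CLR then sweeps between lr and 6 * lr
#                               tl=True)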
for i in range(itercount):
    train_indices, val_indices = train_test_split(
        np.arange(len(meta_parameters_dictionary['train_labels'])),
        test_size=0.1)
    if early:
        early_test_indices = []
        for j in range(len(meta_parameters_dictionary['test_labels'])):
            if meta_parameters_dictionary['test_labels_stage'][j, 0] == 1 \
                    or meta_parameters_dictionary['test_labels_stage'][j, 1] == 1:
                early_test_indices.append(j)
    test_indices = range(len(meta_parameters_dictionary['test_labels']))

    meta_parameters_dictionary['train_indices'] = train_indices
    meta_parameters_dictionary['val_indices'] = val_indices
    meta_parameters_dictionary['test_indices'] = np.array(test_indices)

    training_generator = data(meta_parameters_dictionary, batch_size, True, False)
    val_generator = data(meta_parameters_dictionary, batch_size, False, True)
    test_generator = data(meta_parameters_dictionary, batch_size, False, False)

    csv_logger = CSVLogger(os.path.join(LOGDIR, 'training_{}.log'.format(test_cohort)))
    lrate = CyclicLR(base_lr=0.001, max_lr=0.01, step_size=100, mode='triangular2')
    checkpointer = ExponentialMovingAverage(
        filepath=checkpoint_dir + 'cyclic_{}_{}.h5'.format(test_cohort, i),
        save_best_only=True,
        save_weights_only=True,
        custom_objects={'cox_regression_loss': cox_regression_loss},
        verbose=1)
    lr_monitor = LambdaCallback(
        on_epoch_begin=lambda epoch, logs: print(tf.eval(model.optimizer.lr)))
    lr_callback = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2,
                                    min_lr=0.00001)

    model = get_model(cube_size, clinical_features_size, kernel_size=(3, 3, 3))
    history = model.fit_generator(training_generator,
                                  verbose=2,
                                  epochs=steps,
                                  callbacks=[lr_callback, lr_monitor, lrate,
                                             csv_logger, checkpointer],
                                  validation_data=val_generator,
                                  workers=8,
                                  use_multiprocessing=True,
                                  shuffle=True)
    print(i)

    try:
        model.load_weights(checkpoint_dir + 'cyclic_{}_{}.h5'.format(test_cohort, i))
    except OSError:
        print('Could not find checkpoint:' +
              checkpoint_dir + 'cyclic_{}_{}.h5'.format(test_cohort, i))
        continue

    # tensorboard_callback = TensorBoard(log_dir=LOGDIR, histogram_freq=0, write_graph=True)
    # early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=10)
    # weights = model.get_weights()
                   metrics=None,
                   # clipnorm=0.01
                   )

model, history = train.trainModel(
    nepochs=1,
    run_eagerly=True,
    batchsize=nbatch,
    extend_truth_list_by=len(train.keras_model.outputs) - 2,  # just adapt truth list to avoid keras error (no effect on model)
    batchsize_use_sum_of_squares=False,
    checkperiod=1,  # saves a checkpoint model every N epochs
    verbose=verbosity,
    backup_after_batches=100,
    additional_callbacks=[
        CyclicLR(base_lr=learningrate / 3.,
                 max_lr=learningrate,
                 step_size=50)
    ] + cb)

print("freeze BN")
for l in train.keras_model.layers:
    if 'FullOCLoss' in l.name:
        l.use_average_cc_pos = False
        l.q_min = 0.5
        l.beta_loss_scale = 3.
        # also stop GravNetLLLocalClusterLoss* from being evaluated

learningrate /= 10.
nbatch = 180000
train.compileModel(learningrate=learningrate,
                   loss=None,
                   metrics=None)
xtrain_fold = np.vstack((xtrain_fold, xtrain_pseudo, xtrain_flip_pseudo))
ytrain_fold = np.vstack((ytrain_fold, ytrain_pseudo, ytrain_pseudo))
xtrain_fold, ytrain_fold = shuffle(xtrain_fold, ytrain_fold)
xvalid_fold = xtrain[vidxs, :]
yvalid_fold = ytrain[vidxs, :]

train_size = ytrain_fold.shape[0]
valid_size = yvalid_fold.shape[0]
train_steps = np.ceil(float(train_size) / float(BATCH_SIZE))
valid_steps = np.ceil(float(valid_size) / float(BATCH_SIZE))
print('TRAIN SIZE: %d VALID SIZE: %d' % (train_size, valid_size))

WEIGHTS_BEST = 'weights/best_weight_pseudo_part%d_fold%d.hdf5' % (part, fold)
clr = CyclicLR(base_lr=1e-7, max_lr=2e-4, step_size=4 * train_steps,
               mode='exp_range', gamma=0.99994)
early_stopping = EarlyStopping(monitor='val_acc', patience=20, verbose=1,
                               mode='max')
save_checkpoint = ModelCheckpoint(WEIGHTS_BEST, monitor='val_acc', verbose=1,
                                  save_weights_only=True, save_best_only=True,
                                  mode='max')
callbacks = [save_checkpoint, early_stopping, clr]

model = Model()
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=2e-4),
              metrics=['accuracy'])
model.fit(xtrain_fold, ytrain_fold, batch_size=BATCH_SIZE, epochs=EPOCHS,
          verbose=1, validation_data=(xvalid_fold, yvalid_fold),
          callbacks=callbacks, shuffle=True)

model.load_weights(WEIGHTS_BEST)
ptest += model.predict(xtest, batch_size=BATCH_SIZE, verbose=1)
ptest += model.predict(xtest_flip, batch_size=BATCH_SIZE, verbose=1)
early_stopping = EarlyStopping(monitor='val_acc', patience=8, verbose=1)
save_checkpoint = ModelCheckpoint(WEIGHTS_BEST, monitor='val_acc', verbose=1,
                                  save_best_only=True, save_weights_only=True,
                                  mode='max')
reduce_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=4,
                              min_lr=1e-8, verbose=1)
csv_logger = CSVLogger(TRAINING_LOG, append=True)
clr = CyclicLR(base_lr=1e-8, max_lr=4e-5, step_size=2000.,
               mode='exp_range', gamma=0.99994)

callbacks_warmup = [save_checkpoint, csv_logger]
callbacks_clr = [early_stopping, save_checkpoint, clr, csv_logger]
callbacks = [early_stopping, save_checkpoint, reduce_lr, csv_logger]

model = InceptionV3_Model()

# warm up: freeze all but the last three layers before the first compile
for layer in model.layers[0:-3]:
    layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=8e-5),
              metrics=['accuracy'])
# ) # # ) # # unfix # train.keras_model = fixLayersContaining(train.keras_model, "batch_normalization") # train.keras_model = fixLayersContaining(train.keras_model, "bn_") train.compileModel(learningrate=1e-4, loss=None) # print('frozen:') # for l in train.keras_model.layers: # if not l.trainable: # print(l.name) # 0/0 # train.saveModel('jan.h5') # # 0/0 model, history = train.trainModel( nepochs=10, run_eagerly=True, batchsize=nbatch, batchsize_use_sum_of_squares=False, checkperiod=1, # saves a checkpoint model every N epochs verbose=verbosity, backup_after_batches=100, additional_callbacks=callbacks + [CyclicLR(base_lr=learningrate, max_lr=learningrate * 2., step_size=100)])
# configure some hyperparameters
INIT_LR = 5e-3
EPOCHS = 100
BATCH_SIZE = 96
# STEPS_PER_EPOCH = 320, VALIDATION_STEPS = 64

# add the cyclical learning rate callback
MIN_LR = 1e-7
MAX_LR = 1e-2
CLR_METHOD = "triangular"
STEP_SIZE = 4
clr = CyclicLR(mode=CLR_METHOD, base_lr=MIN_LR, max_lr=MAX_LR,
               step_size=(STEP_SIZE * (np.shape(face_train)[0] // BATCH_SIZE)))

# add a checkpoint to save the network and stop if training doesn't improve
filepath = "../best_weights_" + timestamp + ".hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
earlystop = EarlyStopping(monitor='val_loss', patience=50)
callbacks_list = [checkpoint, earlystop, clr]

# compile the complete model with the optimizer and print a summary on screen
optim = SGD(lr=INIT_LR, momentum=0.9)
model.compile(optimizer=optim, loss='mean_squared_error', metrics=['mae'])
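# For reference, a minimal standalone sketch of the triangular policy that the
# CyclicLR callback configured above implements (Smith, "Cyclical Learning Rates
# for Training Neural Networks", 2017). This is not the callback itself, just the
# per-iteration formula, reusing MIN_LR/MAX_LR from above; step_size=2000 is an
# arbitrary example value and numpy is assumed to be imported as np.
def triangular_lr(iteration, base_lr=MIN_LR, max_lr=MAX_LR, step_size=2000):
    # one full cycle spans 2 * step_size iterations
    cycle = np.floor(1 + iteration / (2 * step_size))
    x = np.abs(iteration / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * np.maximum(0.0, 1.0 - x)

# The lr ramps from MIN_LR up to MAX_LR over the first step_size iterations and
# back down over the next step_size iterations:
# [triangular_lr(i) for i in (0, 1000, 2000, 3000, 4000)]
#   -> [1e-07, ~0.005, 0.01, ~0.005, 1e-07]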
def train(self, train, valid, checkPath, epochs=200, factor=0.8, batch_size=32,
          tensorboardPath=None, lim_lr=0.0009, scheduler_mode=None,
          iteration=None):
    try:
        if tensorboardPath is None:
            t, h = os.path.split(checkPath)
            tensorboardPath = os.path.normpath(t + "//tensorboard//" + str(h))

        patience_stop = 100
        patience_reduce = 3
        self.factor = factor

        earlystop = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=patience_stop)
        checkpointer = keras.callbacks.ModelCheckpoint(filepath=checkPath,
                                                       verbose=1,
                                                       save_best_only=True)
        tboard = LRTensorBoard(tensorboardPath)

        if scheduler_mode is None:
            reduce_lr = keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=float(factor),
                patience=patience_reduce,
                min_delta=0.2,
                min_lr=lim_lr)
        else:
            # the cyclical scheduler needs the number of iterations per epoch
            if iteration is None:
                raise Exception("Number of iterations is unknown")
            str_info = ' scheduler_mode: ' + str(scheduler_mode) + ','
            str_info += ' base_lr: ' + str(self.start_lr) + ','
            str_info += ' max_lr ' + str(self.start_lr * float(factor)) + ','
            str_info += ' step_size: ' + str(int(2 * iteration))
            print(termcolor.colored(str_info, "yellow"))
            reduce_lr = CyclicLR(base_lr=self.start_lr,
                                 max_lr=self.start_lr * float(factor),
                                 mode=scheduler_mode,
                                 step_size=2 * iteration)

        callback_list = [earlystop, checkpointer, tboard, reduce_lr]
        self.model.fit_generator(
            generator=train,
            validation_data=valid,
            epochs=epochs,
            use_multiprocessing=True,
            callbacks=callback_list,
        )
        del self.model
    except Exception as err:
        # keep the original traceback instead of masking it
        raise Exception("Training failed") from err
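# Hypothetical usage sketch for train() above: 'trainer' stands for an instance
# of the class owning this method, and 'iteration' is the number of batches per
# epoch, so step_size = 2 * iteration gives one CLR half-cycle every two epochs.
# Note that with scheduler_mode set, 'factor' becomes the max_lr multiplier
# (max_lr = start_lr * factor) instead of the ReduceLROnPlateau factor.
#
# trainer.train(train_gen, valid_gen,
#               checkPath='checkpoints/model.h5',
#               epochs=200,
#               factor=6.0,                     # max_lr = start_lr * 6
#               scheduler_mode='triangular2',
#               iteration=len(train_gen))       # batches per epoch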
def main():
    start = time.time()

    ap = argparse.ArgumentParser()
    ap.add_argument("-e", "--epochs", required=True, type=int,
                    help="Number of epochs", default=25)
    ap.add_argument("-m", "--model_name", required=True, type=str,
                    help="Imagenet model to train", default="xception")
    ap.add_argument("-b", "--batch_size", required=True, type=int,
                    help="Batch size", default=8)
    ap.add_argument("-im_size", "--image_size", required=True, type=int,
                    help="Image size", default=224)
    args = ap.parse_args()

    # Training dataset loading
    train_data = np.load("train_data.npy")
    train_label = np.load("train_label.npy")
    encoder = LabelEncoder()
    encoder.fit(train_label)
    encoded_y = encoder.transform(train_label)
    Y = utils.to_categorical(encoded_y)
    print("Dataset Loaded...")

    # Train and validation split
    trainX, valX, trainY, valY = train_test_split(train_data,
                                                  Y,
                                                  test_size=0.1,
                                                  shuffle=True,
                                                  random_state=42,
                                                  stratify=Y)
    print(trainX.shape, valX.shape, trainY.shape, valY.shape)

    # Train and validation image data generators
    trainAug = ImageDataGenerator(
        rescale=1.0 / 255.0,
        preprocessing_function=get_random_eraser(p=0.5,
                                                 s_l=0.02,
                                                 s_h=0.4,
                                                 r_1=0.3,
                                                 r_2=1 / 0.3,
                                                 v_l=0,
                                                 v_h=255,
                                                 pixel_level=False),
        rotation_range=30,
        zoom_range=0.15,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.15,
        horizontal_flip=True,
        fill_mode="nearest",
    )
    valAug = ImageDataGenerator(rescale=1.0 / 255.0)

    model = cnn_model(args.model_name, img_size=args.image_size)

    # Number of trainable and non-trainable parameters
    trainable_count = int(
        np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
    non_trainable_count = int(
        np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
    print("Total params: {:,}".format(trainable_count + non_trainable_count))
    print("Trainable params: {:,}".format(trainable_count))
    print("Non-trainable params: {:,}".format(non_trainable_count))

    if not exists("./trained_wts"):
        makedirs("./trained_wts")
    if not exists("./training_logs"):
        makedirs("./training_logs")
    if not exists("./plots"):
        makedirs("./plots")

    # Keras callbacks
    model_checkpoint = ModelCheckpoint(
        "trained_wts/" + args.model_name + ".hdf5",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
    )
    stopping = EarlyStopping(monitor="val_loss", patience=10, verbose=0)
    clr = CyclicLR(mode=CLR_METHOD,
                   base_lr=MIN_LR,
                   max_lr=MAX_LR,
                   step_size=STEP_SIZE * (trainX.shape[0] // args.batch_size))

    print("Training is going to start in 3... 2... 1... ")

    # Model Training
    # (pass the early-stopping and CLR callbacks to fit_generator; otherwise
    # clr.history stays empty and the learning-rate plot below fails)
    H = model.fit_generator(
        trainAug.flow(trainX, trainY, batch_size=args.batch_size),
        steps_per_epoch=len(trainX) // args.batch_size,
        validation_data=valAug.flow(valX, valY),
        validation_steps=len(valX) // args.batch_size,
        epochs=args.epochs,
        callbacks=[model_checkpoint, stopping, clr],
    )

    # plot the training loss and accuracy
    plt.style.use("ggplot")
    plt.figure()
    # number of epochs actually run (early stopping may end training sooner)
    N = len(H.history["loss"])
    plt.plot(np.arange(0, N), H.history["loss"], label="train_loss")
    plt.plot(np.arange(0, N), H.history["val_loss"], label="val_loss")
    plt.plot(np.arange(0, N), H.history["accuracy"], label="train_acc")
    plt.plot(np.arange(0, N), H.history["val_accuracy"], label="val_acc")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="lower left")
    plt.savefig("plots/training_plot.png")

    N = np.arange(0, len(clr.history["lr"]))
    plt.figure()
    plt.plot(N, clr.history["lr"])
    plt.title("Cyclical Learning Rate (CLR)")
    plt.xlabel("Training Iterations")
    plt.ylabel("Learning Rate")
    plt.savefig("plots/cyclic_lr.png")

    end = time.time()
    dur = end - start
    if dur < 60:
        print("Execution Time:", dur, "seconds")
    elif dur > 60 and dur < 3600:
        dur = dur / 60
        print("Execution Time:", dur, "minutes")
    else:
        dur = dur / (60 * 60)
        print("Execution Time:", dur, "hours")
def _main(): # argument parsing parser = argparse.ArgumentParser( description='Trains an image similarity detector.') parser.add_argument('--training-images-dir', type=str, help='The directory containing the training images' 'input files (JSON).') parser.add_argument( '--validation-images-dir', type=str, default=None, help='The directory containing the validation images input files.' 'If not specified, than no validation is performed (default behavior).' ) parser.add_argument('--images-dir', type=str, help='The root of the images directory.') parser.add_argument( '--output-dir', type=str, help='The output directory where the checkpoints will be stored.') parser.add_argument('--restart-checkpoint', type=str, default=None, help='The checkpoint from which to restart.') parser.add_argument( '--image-size', type=int, default=224, help='The image size in pixels, default is 224 (meaning 224x224).') parser.add_argument( '--preload-images', type=int, default=0, help= 'Preload (cache) images before starting training, 0 if not needed, else: number of bytes ' 'to load in cache.') parser.add_argument('--greyscale', type=int, default=0, help='If set to 1, converts images to greyscale.') parser.add_argument('--batch-size', type=int, default=24, help='The training minibatch size.') parser.add_argument('--loss-batch', type=int, default=4, help='The loss minibatch size.') parser.add_argument( '--backbone', type=str, default='mobilenetv2', help='The network backbone: mobilenetv2 (default), densenet121') parser.add_argument('--freeze-backbone', type=int, default=0, help='If set to 1, freeze the backbone.') parser.add_argument( '--feature-len', type=int, default=128, help= 'If larger than 0, a 1x1 convolution is added that converts the backbone output features ' 'to a layer with depth equal to --feature-len.') parser.add_argument( '--margin', type=float, default=0.4, help='The margin for the triple loss (default is 0.4).') parser.add_argument( '--soft', type=int, default=0, help='If set to 1, use soft margins when computing loss.') parser.add_argument( '--metric', type=str, default='euclidian', help= 'The distance metric: Euclidian (euclidian) or binary cross-entropy (binaryce). By ' 'fedault it is Euclidian.') parser.add_argument( '--max-lr', type=float, default=1e-4, help='The maximum (and also initial) learning rate (1e-4 by default).') parser.add_argument('--min-lr', type=float, default=1e-5, help='The minimum learning rate (1e-5 by default).') parser.add_argument( '--lr-schedule', type=str, default='cosine', help='The learning rate schedule: cosine (default), cyclic.') parser.add_argument( '--lr-schedule-cycle', type=int, default=100000, help='The lerning rate cycle length (number of images).') parser.add_argument('--images-per-epoch', type=int, default=10000, help='The number of images per epoch.') parser.add_argument('--start-epoch', type=int, default=1, help='The starting epoch (1 by default).') parser.add_argument('--end-epoch', type=int, default=5000, help='The ending epoch (5000 by default).') parser.add_argument('--checkpoint-name', type=str, default='chkpt', help='The root of the checkpoint names.') parser.add_argument( '--checkpoint-freq', type=int, default=100, help='The frequency of checkpoints in epochs. Default is 100.') parser.add_argument( '--early-stopping-patience', type=int, default=-1, help= 'The number of epoch to wait before stopping if the validation loss does not decrease. 
' 'Set to -1 to disable (default)') parser.add_argument( '--no-aug-prob', type=float, default=0.2, help='The probability that an image is not augmented at all.') parser.add_argument('--crop-prob', type=float, default=0.0, help='The crop probability (0.05 by default).') parser.add_argument( '--crop-frac', type=float, default=0.09, help='The maximum fraction of area cropped-out (0.16 by default).') parser.add_argument('--fill-letterbox', type=int, default=0, help='Fill the letterbox (for small images') parser.add_argument('--jitter-prob', type=float, default=0.2, help='The jitter probability (0.2 by default') parser.add_argument('--jitter', type=float, default=0.1, help='The jitter size (0.1 by default).') parser.add_argument('--rotation-prob', type=float, default=0.0, help='The rotation probability.') parser.add_argument('--rotation-angle', type=float, default=0.0, help='The maximum rotation angle.') parser.add_argument( '--rotation-expand-prob', type=float, default=0, help= 'Probability to expand the image when rotating to not lose anything.') parser.add_argument('--scale-prob', type=float, default=0.1, help='The rescaling probability.') parser.add_argument('--scale-min', type=float, default=1.0, help='The minimum image rescaling factor.') parser.add_argument('--scale-max', type=float, default=1.0, help='The maximum image rescaling factor.') parser.add_argument( '--hflip', type=float, default=0.0, help='The horizontal flip probability (0.0 by default).') parser.add_argument('--no-colour-transforms', type=int, default=0, help='Do not transform colors.') parser.add_argument('--vflip', type=float, default=0.0, help='The vertical flip probability (0.0 by default).') parser.add_argument( '--hue', type=float, default=0.05, help='The hue variation (ignored for siamese backbone).') parser.add_argument( '--sat', type=float, default=0.2, help='The saturation variation (ignored for siamese backbone).') parser.add_argument( '--val', type=float, default=0.2, help='The value variation (ignored for siamese backbone).') parser.add_argument( '--mlflow', type=int, default=0, help='Set to 1 if using MLflow. 
Metrics and artifacts will be logged.') args = parser.parse_args() # start the mlflow autologging if args.mlflow: import mlflow.keras mlflow.keras.autolog() # create the training image list train_data = load_data(args.training_images_dir, verbose=False) train_imgs, train_cache = preload_images(train_data, 4, args.images_dir, args.preload_images) train_parents = list(train_imgs.keys()) np.random.shuffle(train_parents) train_lens = {} for k, v in train_imgs.items(): cur_len = len(v) if cur_len in train_lens: train_lens[cur_len] += 1 else: train_lens[cur_len] = 1 train_lens = pd.DataFrame(train_lens, index=[0]) print("Training length distribution:") print(train_lens) if args.validation_images_dir: do_valid = True val_data = load_data(args.validation_images_dir, verbose=False) val_imgs, val_cache = preload_images(val_data, 4, args.images_dir, args.preload_images) val_parents = list(val_imgs.keys()) np.random.shuffle(val_parents) else: do_valid = False print('There are {} training images.'.format(len(train_imgs))) if do_valid: print('There are {} validation images.'.format(len(val_imgs))) # create the output directory if necessary if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # scale the larning rate to the batch size max_lr = args.max_lr min_lr = args.min_lr # create the model num_channels = 1 if args.backbone == 'siamese' else 3 encoder = create_model((args.image_size, args.image_size, num_channels), restart_checkpoint=args.restart_checkpoint, backbone=args.backbone, feature_len=args.feature_len, freeze=args.freeze_backbone == 1) # compile the model with the initial learning rate bh_loss = Lambda(batch_hard_loss, output_shape=(1, ), name='batch_hard', arguments={ 'loss_batch': args.loss_batch, 'loss_margin': args.margin, 'soft': args.soft == 1, 'metric': args.metric })(encoder.output) model = Model(encoder.input, bh_loss) model.compile(loss={ 'batch_hard': lambda y_true, y_pred: y_pred }, optimizer=Adam(lr=max_lr)) print(model.summary()) print('Loss metric: {}'.format(args.metric)) if args.soft == 1: print('Using soft margins.') # prepare the callbacks info_lr = lr_info(model, args.mlflow == 1) # learning rate true_batch_size = args.batch_size // args.loss_batch print( 'Scaling the learning rate minimum to {} and maximum (initial) to {}'. 
format(min_lr, max_lr)) if args.lr_schedule == 'cosine': print('Using the cosine annealing learning rate scheduler.') lr_callback = CosineAnnealingScheduler( max_lr, true_batch_size, args.lr_schedule_cycle, min_lr=min_lr, verbose=True, initial_counter=(args.start_epoch - 1) * args.images_per_epoch) else: lr_callback = CyclicLR(mode='triangular', max_lr=max_lr, base_lr=min_lr, step_size=args.lr_schedule_cycle // true_batch_size) # checkpoints checkpoint = MyModelCheckpoint( filepath=os.path.join(args.output_dir, args.checkpoint_name + '_' + '{epoch:04d}'), snapshot_path=os.path.join(args.output_dir, args.checkpoint_name + '-snapshot'), model_body=None, encoder=encoder, save_best_only=do_valid, period=args.checkpoint_freq, verbose=1, mlflow=args.mlflow == 1) callbacks = [info_lr, lr_callback, checkpoint] if do_valid and args.early_stopping_patience != -1: callbacks.append( EarlyStopping(monitor='val_loss', patience=args.early_stopping_patience)) # train print('Batch configuration:') print('Loss batch: {}'.format(args.loss_batch)) print('Positives + anchors: {}'.format(args.loss_batch // 4)) print('Negatives: {}'.format(args.loss_batch - args.loss_batch // 4)) print('Effective minibatch: {}'.format(true_batch_size)) print('Encoder minibatch: {}'.format(args.batch_size)) augment = { 'scale_prob': args.scale_prob, 'scale_min': args.scale_min, 'scale_max': args.scale_max, 'crop_prob': args.crop_prob, 'crop_frac': args.crop_frac, 'jitter_prob': args.jitter_prob, 'jitter': args.jitter, 'rotate_prob': args.rotation_prob, 'rotate_angle': args.rotation_angle, 'rotate_expand_prob': args.rotation_expand_prob, 'hflip_prob': args.hflip, 'vflip_prob': args.vflip } if args.no_colour_transforms == 0: augment['hue']: args.hue augment['saturation']: args.sat augment['value']: args.val train_generator = data_generator( train_imgs, train_parents, args.batch_size, args.loss_batch, (args.image_size, args.image_size, num_channels), args.no_aug_prob, augment=augment, greyscale=args.greyscale == 1, fill_letterbox=args.fill_letterbox == 1, cache=train_cache) if do_valid: val_generator = data_generator( val_imgs, val_parents, args.batch_size, args.loss_batch, (args.image_size, args.image_size, num_channels), args.no_aug_prob, augment=augment, greyscale=args.greyscale == 1, fill_letterbox=args.fill_letterbox == 1, cache=val_cache) else: val_generator = None model.fit_generator( train_generator, steps_per_epoch=max(1, args.images_per_epoch // true_batch_size), validation_data=val_generator, validation_steps=max(1, args.images_per_epoch // true_batch_size), epochs=args.end_epoch, initial_epoch=args.start_epoch - 1, callbacks=callbacks)
def get_training_param_img( hypa: ty.Dict[str, str], use_validation: bool, model_path: ty.Optional[Path], num_samples: int, ) -> ty.Dict[str, ty.Any]: """MAKEDOC: what is get_training_param_img doing?""" logg = logging.getLogger(f"c.{__name__}.get_training_param_img") # logg.setLevel("INFO") # logg.debug("Start get_training_param_img") training_param: ty.Dict[str, ty.Any] = {} training_param["batch_size"] = int(hypa["batch_size_type"]) training_param["epochs"] = int(hypa["epoch_num_type"]) # translate from short key to long name learning_rate_types = { "01": "fixed01", "02": "fixed02", "03": "exp_decay_step_01", "04": "exp_decay_smooth_01", "05": "clr_triangular2_01", "06": "clr_triangular2_02", } learning_rate_type = hypa["learning_rate_type"] lr_name = learning_rate_types[learning_rate_type] training_param["lr_name"] = lr_name if lr_name.startswith("fixed"): if lr_name == "fixed01": lr = 1e-3 elif lr_name == "fixed02": lr = 1e-4 else: lr = 1e-3 optimizer_types = { "a1": Adam(learning_rate=lr), "r1": RMSprop(learning_rate=lr) } training_param["opt"] = optimizer_types[hypa["optimizer_type"]] callbacks = [] if lr_name.startswith("exp_decay"): if lr_name == "exp_decay_step_01": exp_decay_part = partial(exp_decay_step, epochs_drop=5) elif lr_name == "exp_decay_smooth_01": exp_decay_part = partial(exp_decay_smooth, epochs_drop=5) lrate = LearningRateScheduler(exp_decay_part) callbacks.append(lrate) # setup cyclic learning rate elif lr_name.startswith("clr_triangular2"): # target_cycles = the number of cycles we want in those epochs # it_per_epoch = num_samples // batch_size # total_iterations = it_per_epoch * epoch_num # step_size = total_iterations // target_cycles if lr_name == "clr_triangular2_01": target_cycles = 2 it_per_epoch = num_samples // training_param["batch_size"] total_iterations = it_per_epoch * training_param["epochs"] step_size = total_iterations // (target_cycles * 2) base_lr = 1e-5 max_lr = 1e-3 elif lr_name == "clr_triangular2_02": target_cycles = 8 it_per_epoch = num_samples // training_param["batch_size"] total_iterations = it_per_epoch * training_param["epochs"] step_size = total_iterations // (target_cycles * 2) base_lr = 1e-6 max_lr = 1e-3 logg.debug(f"target_cycles: {target_cycles}") logg.debug(f"it_per_epoch: {it_per_epoch}") logg.debug(f"total_iterations: {total_iterations}") logg.debug(f"num_samples: {num_samples}") logg.debug(f"CLR is using step_size: {step_size}") mode = "triangular2" cyclic_lr = CyclicLR(base_lr, max_lr, step_size, mode) callbacks.append(cyclic_lr) # which metric to monitor for early_stop and model_checkpoint metric_to_monitor = "val_loss" if use_validation else "loss" if lr_name.startswith("fixed") or lr_name.startswith("exp_decay"): early_stop = EarlyStopping( monitor=metric_to_monitor, patience=4, restore_best_weights=True, verbose=1, ) callbacks.append(early_stop) # to inhibit checkpointing by passing None if model_path is not None: model_checkpoint = ModelCheckpoint(str(model_path), monitor=metric_to_monitor, verbose=1, save_best_only=True) callbacks.append(model_checkpoint) training_param["callbacks"] = callbacks return training_param
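# Worked example of the step_size arithmetic used for the clr_triangular2 options
# above (illustrative numbers only, not taken from any dataset here): asking for
# 2 triangular2 cycles over 15 epochs of 20_000 samples at batch size 32.
ex_num_samples, ex_batch_size, ex_epochs, ex_target_cycles = 20_000, 32, 15, 2
ex_it_per_epoch = ex_num_samples // ex_batch_size             # 625 iterations per epoch
ex_total_iterations = ex_it_per_epoch * ex_epochs             # 9_375 iterations in total
ex_step_size = ex_total_iterations // (ex_target_cycles * 2)  # 2_343: one half-cycle
# i.e. the learning rate rises for ~2_343 batches and falls for the next ~2_343,
# completing the two requested triangular2 cycles over the whole training run.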
def init_callbacks(self, n_epochs, n_batches, **kwargs): from keras.callbacks import TerminateOnNaN, EarlyStopping, \ ReduceLROnPlateau, CSVLogger, TensorBoard from validation_checkpoint import ValidationCheckpoint from clr_callback import CyclicLR print('Initializing model callbacks') use_tensorboard = kwargs.pop('use_tensorboard', False) val_monitor = kwargs.pop('monitor', 'val_loss') callbacks = kwargs.pop('callbacks', []) # strides to test/save model during training test_period = kwargs.pop('test_period', 1) save_period = kwargs.pop('save_period', 0) # 0 = disable warmup_epoch = kwargs.pop('warmup_epoch', 3) random_state = kwargs.pop('random_state', 42) verbose = kwargs.pop('verbose', 1) test_ids = kwargs.pop('test_ids', []) step_lr = kwargs.pop('step_lr', None) clr_mult = kwargs.pop('clr_mult', 4) # total epochs before CLR changes signs # exit early if the last [stop_early] test scores are all worse than the best early_stop = int(n_epochs * 0.2) stop_early = kwargs.pop('stop_early', None) or early_stop stop_delta = optparams['stop_delta'] save_preds = kwargs.pop('save_preds', True) save_model = kwargs.pop('save_model', True) model_dir = self.model_dir initial_monitor = self.start_monitor initial_epoch = self.start_epoch # configure callbacks val_mode = 'auto' ctimestr = epoch2str(gettime()) train_logf = pathjoin( model_dir, 'training_log_%s_pid%d.csv' % (ctimestr, self.pid)) # if pathexists(train_logf) and pathsize(train_logf) != 0: # #ctimestr = epoch2str(gettime()) # #ctimestr = epoch2str(pathctime(train_logf)) # ctimestr = '1' # logf_base,logf_ext = splitext(train_logf) # old_logf = logf_base+'_'+ctimestr+logf_ext # print('Backing up existing log file "%s" to "%s"'%(train_logf,old_logf)) # os.rename(train_logf,old_logf) self.val_monitor = val_monitor self.save_preds = save_preds self.save_model = save_model self.save_period = save_period self.test_period = test_period self.stop_early = stop_early self.stop_delta = stop_delta self.test_ids = test_ids self.val_cb = ValidationCheckpoint(val_monitor=val_monitor, save_best_preds=save_preds, save_best_model=save_model, model_dir=model_dir, mode=val_mode, pid=self.pid, initial_monitor=initial_monitor, initial_epoch=initial_epoch, warmup_epoch=warmup_epoch, save_period=save_period, test_period=test_period, test_ids=test_ids, verbose=verbose) #self.val_cb = ModelCheckpoint(model_iterf,monitor=val_monitor,mode=val_mode, period=save_epoch, # save_best_only=True, save_weights_only=False, # verbose=False) step_lr = step_lr or int(n_batches * clr_mult) self.lr_cb = CyclicLR(base_lr=optparams['lr_min'], max_lr=optparams['lr_max'], step_size=step_lr) # else: # step_lr = step_lr or min(100,int(n_epochs*0.01)) # self.lr_cb = ReduceLROnPlateau(monitor=val_monitor, # mode=val_mode, # patience=step_lr, # min_lr=optparams['lr_min'], # factor=optparams['reduce_lr'], # epsilon=optparams['tol'], # verbose=verbose) self.es_cb = EarlyStopping(monitor=val_monitor, mode=val_mode, patience=stop_early, min_delta=stop_delta, verbose=verbose) self.tn_cb = TerminateOnNaN() self.cv_cb = CSVLogger(filename=train_logf, append=True) self.callbacks = callbacks + [ self.val_cb, self.lr_cb, self.es_cb, self.tn_cb, self.cv_cb ] if self.backend == 'tensorflow' and use_tensorboard: tb_batch_size = 32 tb_histogram_freq = 1 tb_embeddings_freq = 0 tb_log_dir = pathjoin(model_dir, 'tb_logs_pid%d' % self.pid) if not pathexists(tb_log_dir): os.makedirs(tb_log_dir) self.tb_cb = TensorBoard(log_dir=tb_log_dir, histogram_freq=tb_histogram_freq, batch_size=tb_batch_size, 
write_graph=True, write_grads=True, write_images=True, embeddings_freq=tb_embeddings_freq, embeddings_layer_names=None, embeddings_metadata=None) self.callbacks.append(self.tb_cb) elif self.backend != 'tensorflow' and use_tensorboard: print('Cannot use tensorboard with backend "%s"' % self.backend) use_tensorboard = False print('Initialized %d callbacks:' % len(self.callbacks), str(self.callbacks))
###############################################################################
# Train the model
###############################################################################
early_stopper = EarlyStopping(monitor='val_loss', verbose=1,
                              patience=args.patience)
model_checkpoint = ModelCheckpoint(args.model_path, monitor='val_loss',
                                   mode='min', save_best_only=True, verbose=1)
callbacks = [early_stopper, model_checkpoint]
if args.cyclical_learning_rate:
    callbacks.append(
        CyclicLR(base_lr=0.0005,
                 max_lr=0.006,
                 step_size=4 * STEPS_PER_EPOCH,
                 mode='triangular2'))

VAL_SUBSPLITS = 5
VALIDATION_STEPS = info.splits['test'].num_examples // args.batch_size // VAL_SUBSPLITS

model_history = model.fit(tfds.as_numpy(augmentedDataset()),
                          epochs=args.max_epochs,
                          steps_per_epoch=STEPS_PER_EPOCH,
                          validation_steps=VALIDATION_STEPS,
                          validation_data=tfds.as_numpy(validate_dataset),
                          callbacks=callbacks)

###############################################################################
# Load the best model snapshot and evaluate the quality
###############################################################################
def main(data_module, model_module, optimizer_module, filename, config, use_val=False): """Patch everything together.""" batch_size = config['train']['batch_size'] nb_epoch = config['train']['epochs'] today = datetime.datetime.now() datestring = today.strftime('%Y%m%d-%H%M-%S') # The data, shuffled and split between train and test sets: data = data_module.load_data(config) print("Data loaded.") X_train, y_train = data['x_train'], data['y_train'] X_train = data_module.preprocess(X_train) # Get use_val value if 'use_val' in config['train']: use_val = config['train']['use_val'] else: use_val = True # Get training / validation sets if use_val: X_test, y_test = data['x_val'], data['y_val'] else: X_test, y_test = data['x_test'], data['y_test'] X_val = data_module.preprocess(data['x_val']) X_train = np.append(X_train, X_val, axis=0) y_train = np.append(y_train, data['y_val'], axis=0) X_test = data_module.preprocess(X_test) # load hierarchy, if present if 'hierarchy_path' in config['dataset']: ret = handle_hierarchies(config, data_module, X_train, y_train, X_test, y_test) # hierarchy = ret['hierarchy'] X_train = ret['X_train'] y_train = ret['y_train'] X_test = ret['X_test'] y_test = ret['y_test'] nb_classes = data_module.n_classes logging.info("# classes = {}".format(data_module.n_classes)) img_rows = data_module.img_rows img_cols = data_module.img_cols img_channels = data_module.img_channels da = config['train']['data_augmentation'] # Convert class vectors to binary class matrices. Y_train = np_utils.to_categorical(y_train, nb_classes) Y_test = np_utils.to_categorical(y_test, nb_classes) # Y_train = Y_train.reshape((-1, 1, 1, nb_classes)) # For fcn # Y_test = Y_test.reshape((-1, 1, 1, nb_classes)) if 'smooth_train' in config['dataset']: Y_train = np.load(config['dataset']['smooth_train']) if 'smooth_test_path' in config['dataset']: Y_test = np.load(config['dataset']['smooth_test_path']) # Input shape depends on the backend if K.image_dim_ordering() == "th": input_shape = (img_channels, img_rows, img_cols) else: input_shape = (img_rows, img_cols, img_channels) model = model_module.create_model(nb_classes, input_shape, config) print("Model created") if 'initializing_model_path' in config['model']: init_model_path = config['model']['initializing_model_path'] if not os.path.isfile(init_model_path): logging.error( "initializing_model={} not found".format(init_model_path)) sys.exit(-1) init_model = load_model(init_model_path) layer_dict_init = dict([(layer.name, layer) for layer in init_model.layers]) layer_dict_model = dict([(layer.name, layer) for layer in model.layers]) for layer_name in layer_dict_model.keys(): if layer_name in layer_dict_init: print("\tLoad layer weights '{}'".format(layer_name)) weights = layer_dict_init[layer_name].get_weights() try: layer_dict_model[layer_name].set_weights(weights) except ValueError: print("\t\twrong shape - skip") logging.info("Done initializing") model.summary() optimizer = optimizer_module.get_optimizer(config) model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=["accuracy"]) print("Finished compiling") print("Building model...") es = EarlyStopping(monitor='val_acc', min_delta=0, patience=10, verbose=1, mode='auto') history_cb = History() callbacks = [es, history_cb] # remote, if 'checkpoint' in config['train'] and config['train']['checkpoint']: checkpoint_fname = os.path.basename(config['train']['artifacts_path']) if 'saveall' in config['train'] and config['train']['saveall']: checkpoint_fname = ("{}_{}.chk.{{epoch:02d}}.h5".format( 
checkpoint_fname, datestring)) save_best_only = False else: checkpoint_fname = "{}_{}.chk.h5".format(checkpoint_fname, datestring) save_best_only = True model_chk_path = os.path.join(config['train']['artifacts_path'], checkpoint_fname) model_chk_path = get_nonexistant_path(model_chk_path) checkpoint = ModelCheckpoint(model_chk_path, monitor="val_acc", save_best_only=save_best_only, save_weights_only=False) callbacks.append(checkpoint) if 'tensorboard' in config['train'] and config['train']['tensorboard']: tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True) callbacks.append(tensorboard) if 'remote' in config['train'] and config['train']['remote']: remote = RemoteMonitor(root='http://localhost:9000') callbacks.append(remote) if 'lr_reducer' in config['train'] and config['train']['lr_reducer']: lr_reducer = ReduceLROnPlateau(monitor='val_acc', factor=0.3, cooldown=0, patience=3, min_lr=0.5e-6, verbose=1) callbacks.append(lr_reducer) if 'clr' in config['train']: clr = CyclicLR(base_lr=config['train']['clr']['base_lr'], max_lr=config['train']['clr']['max_lr'], step_size=(config['train']['clr']['step_size'] * (X_train.shape[0] // batch_size)), mode=config['train']['clr']['mode']) callbacks.append(clr) if not da: print('Not using data augmentation.') if 'checkpoint' in config['train'] and config['train']['checkpoint']: model.save( model_chk_path.format(epoch=0).replace('.00.', '.00.a.')) t0 = time.time() model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch, validation_data=(X_test, Y_test), shuffle=True, callbacks=callbacks) t1 = time.time() t2 = t1 epochs_augmented_training = 0 else: print('Using real-time data augmentation.') if 'hue_shift' in da: hsv_augmentation = (da['hue_shift'], da['saturation_scale'], da['saturation_shift'], da['value_scale'], da['value_shift']) else: hsv_augmentation = None # This will do preprocessing and realtime data augmentation: datagen = ImageDataGenerator( # set input mean to 0 over the dataset featurewise_center=da['featurewise_center'], # set each sample mean to 0 samplewise_center=da['samplewise_center'], # divide inputs by std of the dataset featurewise_std_normalization=False, # divide each input by its std samplewise_std_normalization=da['samplewise_std_normalization'], zca_whitening=da['zca_whitening'], # randomly rotate images in the range (degrees, 0 to 180) rotation_range=da['rotation_range'], # randomly shift images horizontally (fraction of total width) width_shift_range=da['width_shift_range'], # randomly shift images vertically (fraction of total height) height_shift_range=da['height_shift_range'], horizontal_flip=da['horizontal_flip'], vertical_flip=da['vertical_flip'], hsv_augmentation=hsv_augmentation, zoom_range=da['zoom_range'], shear_range=da['shear_range'], channel_shift_range=da['channel_shift_range']) # Compute quantities required for featurewise normalization # (std, mean, and principal components if ZCA whitening is applied). datagen.fit(X_train, seed=0) # Apply normalization to test data for i in range(len(X_test)): X_test[i] = datagen.standardize(X_test[i]) # Fit the model on the batches generated by datagen.flow(). 
steps_per_epoch = X_train.shape[0] // batch_size if 'checkpoint' in config['train'] and config['train']['checkpoint']: model.save( model_chk_path.format(epoch=0).replace('.00.', '.00.a.')) t0 = time.time() model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size), steps_per_epoch=steps_per_epoch, epochs=nb_epoch, validation_data=(X_test, Y_test), callbacks=callbacks) t1 = time.time() # Train one epoch without augmentation to make sure data distribution # is fit well loss_history = history_cb.history["loss"] epochs_augmented_training = len(loss_history) model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch, validation_data=(X_test, Y_test), shuffle=True, callbacks=callbacks, initial_epoch=len(loss_history)) t2 = time.time() loss_history = history_cb.history["loss"] acc_history = history_cb.history["acc"] val_acc_history = history_cb.history["val_acc"] np_loss_history = np.array(loss_history) np_acc_history = np.array(acc_history) np_val_acc_history = np.array(val_acc_history) history_data = zip(list(range(1, len(np_loss_history) + 1)), np_loss_history, np_acc_history, np_val_acc_history) history_data = [(el[0], "%0.4f" % el[1], "%0.4f" % el[2], "%0.4f" % el[3]) for el in history_data] history_fname = os.path.basename(config['train']['artifacts_path']) history_fname = "{}_{}_history.csv".format(history_fname, datestring) csv_path = os.path.join(config['train']['artifacts_path'], history_fname) csv_path = get_nonexistant_path(csv_path) with open(csv_path, 'w') as fp: writer = csv.writer(fp, delimiter=',') writer.writerows([("epoch", "loss", "acc", "val_acc")]) writer.writerows(history_data) training_time = t1 - t0 readjustment_time = t2 - t1 print("wall-clock training time: {}s".format(training_time)) model_fn = os.path.basename(config['train']['artifacts_path']) model_fn = "{}_{}.h5".format(model_fn, datestring) model_fn = os.path.join(config['train']['artifacts_path'], model_fn) model_fn = get_nonexistant_path(model_fn) model.save(model_fn) # Store training meta data data = { 'training_time': training_time, 'readjustment_time': readjustment_time, 'HOST': platform.node(), 'epochs': len(history_data), 'epochs_augmented_training': epochs_augmented_training, 'config': config } meta_train_fname = os.path.join(config['train']['artifacts_path'], "train-meta_{}.json".format(datestring)) meta_train_fname = get_nonexistant_path(meta_train_fname) with open(meta_train_fname, 'w') as outfile: str_ = json.dumps(data, indent=4, sort_keys=True, separators=(',', ': '), ensure_ascii=False) outfile.write(str_)
# model.add(Dense(24, activation='relu'))
model.add(Dense(12, activation='relu'))
# model.add(Dropout(0.05))
# model.add(Flatten())
# model.add(Dropout(0.05))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'binary_crossentropy'])

# initialize the cyclical learning rate callback
print("[INFO] using '{}' method".format(config.CLR_METHOD))
clr = CyclicLR(mode=config.CLR_METHOD,
               base_lr=config.MIN_LR,
               max_lr=config.MAX_LR,
               step_size=config.STEP_SIZE * (X_train.shape[0] // config.BATCH_SIZE))

# fit the model
baseline_history = model.fit(
    X_train,
    y_train,
    epochs=config.NUM_EPOCHS,
    callbacks=[clr],
    batch_size=config.BATCH_SIZE,
    # steps_per_epoch=X_train.shape[0] // config.BATCH_SIZE,
    validation_data=(X_test, y_test),
    verbose=1)

model.summary()
pred_train = model.predict(X_train)
def train_attention(hypa: ty.Dict[str, str], force_retrain: bool, use_validation: bool) -> None: """MAKEDOC: what is train_attention doing?""" logg = logging.getLogger(f"c.{__name__}.train_attention") # logg.setLevel("INFO") logg.debug("Start train_attention") # build the model name model_name = build_attention_name(hypa, use_validation) logg.debug(f"model_name: {model_name}") # save the trained model here model_folder = Path("trained_models") / "attention" if not model_folder.exists(): model_folder.mkdir(parents=True, exist_ok=True) model_path = model_folder / f"{model_name}.h5" placeholder_path = model_folder / f"{model_name}.txt" # check if this model has already been trained if placeholder_path.exists(): if force_retrain: logg.warn("\nRETRAINING MODEL!!\n") else: logg.debug("Already trained") return # save info regarding the model training in this folder info_folder = Path("info") / "attention" / model_name if not info_folder.exists(): info_folder.mkdir(parents=True, exist_ok=True) # get the word list words = words_types[hypa["words_type"]] num_labels = len(words) # load data processed_folder = Path("data_proc") processed_path = processed_folder / f"{hypa['dataset_name']}" data, labels = load_processed(processed_path, words) # concatenate train and val for final train val_data = None if use_validation: x = data["training"] y = labels["training"] val_data = (data["validation"], labels["validation"]) logg.debug("Using validation data") else: x = np.concatenate((data["training"], data["validation"])) y = np.concatenate((labels["training"], labels["validation"])) logg.debug("NOT using validation data") # the shape of each sample input_shape = data["training"][0].shape # from hypa extract model param model_param = get_model_param_attention(hypa, num_labels, input_shape) batch_size_types = {"01": 32, "02": 16} batch_size = batch_size_types[hypa["batch_size_type"]] epoch_num_types = {"01": 15, "02": 30, "03": 2, "04": 4} epoch_num = epoch_num_types[hypa["epoch_num_type"]] # magic to fix the GPUs setup_gpus() model = AttentionModel(**model_param) # model.summary() metrics = [ tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), ] learning_rate_types = { "01": "fixed01", "02": "fixed02", "03": "exp_decay_step_01", "04": "exp_decay_smooth_01", "05": "clr_triangular2_01", "06": "clr_triangular2_02", "07": "clr_triangular2_03", "08": "clr_triangular2_04", "09": "clr_triangular2_05", "10": "exp_decay_smooth_02", } learning_rate_type = hypa["learning_rate_type"] lr_value = learning_rate_types[learning_rate_type] # setup opt fixed lr values if lr_value.startswith("fixed"): if lr_value == "fixed01": lr = 1e-3 elif lr_value == "fixed02": lr = 1e-4 else: lr = 1e-3 optimizer_types = { "a1": Adam(learning_rate=lr), "r1": RMSprop(learning_rate=lr) } opt = optimizer_types[hypa["optimizer_type"]] model.compile( optimizer=opt, loss=tf.keras.losses.CategoricalCrossentropy(), metrics=metrics, ) # setup callbacks callbacks = [] # setup exp decay step / smooth if lr_value.startswith("exp_decay"): if lr_value == "exp_decay_step_01": exp_decay_part = partial(exp_decay_step, epochs_drop=5) elif lr_value == "exp_decay_smooth_01": exp_decay_part = partial(exp_decay_smooth, epochs_drop=5) elif lr_value == "exp_decay_smooth_02": exp_decay_part = partial(exp_decay_smooth, epochs_drop=5, initial_lrate=1e-2) lrate = LearningRateScheduler(exp_decay_part) callbacks.append(lrate) # setup cyclic learning rate if lr_value.startswith("clr_triangular2"): base_lr = 1e-5 max_lr = 1e-3 # 
training iteration per epoch = num samples // batch size # step size suggested = 2~8 * iterations if lr_value == "clr_triangular2_01": step_factor = 8 step_size = step_factor * x.shape[0] // batch_size elif lr_value == "clr_triangular2_02": step_factor = 2 step_size = step_factor * x.shape[0] // batch_size # target_cycles = the number of cycles we want in those epochs # it_per_epoch = num_samples // batch_size # total_iterations = it_per_epoch * epoch_num # step_size = total_iterations // target_cycles elif lr_value == "clr_triangular2_03": # the number of cycles we want in those epochs target_cycles = 4 it_per_epoch = x.shape[0] // batch_size total_iterations = it_per_epoch * epoch_num step_size = total_iterations // (target_cycles * 2) elif lr_value == "clr_triangular2_04": # the number of cycles we want in those epochs target_cycles = 2 it_per_epoch = x.shape[0] // batch_size total_iterations = it_per_epoch * epoch_num step_size = total_iterations // (target_cycles * 2) elif lr_value == "clr_triangular2_05": # the number of cycles we want in those epochs target_cycles = 2 it_per_epoch = x.shape[0] // batch_size total_iterations = it_per_epoch * epoch_num step_size = total_iterations // (target_cycles * 2) # set bigger starting value max_lr = 1e-2 logg.debug(f"x.shape[0]: {x.shape[0]}") logg.debug(f"CLR is using step_size: {step_size}") mode = "triangular2" cyclic_lr = CyclicLR(base_lr, max_lr, step_size, mode) callbacks.append(cyclic_lr) # setup early stopping if learning_rate_type in ["01", "02", "03", "04"]: metric_to_monitor = "val_loss" if use_validation else "loss" early_stop = EarlyStopping( monitor=metric_to_monitor, patience=4, restore_best_weights=True, verbose=1, ) callbacks.append(early_stop) # model_checkpoint = ModelCheckpoint( # model_name, # monitor="val_loss", # save_best_only=True, # ) # a dict to recreate this training # FIXME this should be right before fit and have epoch_num/batch_size/lr info recap: ty.Dict[str, ty.Any] = {} recap["words"] = words recap["hypa"] = hypa recap["model_param"] = model_param recap["use_validation"] = use_validation recap["model_name"] = model_name recap["version"] = "001" # logg.debug(f"recap: {recap}") recap_path = info_folder / "recap.json" recap_path.write_text(json.dumps(recap, indent=4)) results = model.fit( x, y, validation_data=val_data, epochs=epoch_num, batch_size=batch_size, callbacks=callbacks, ) results_recap: ty.Dict[str, ty.Any] = {} results_recap["model_name"] = model_name results_recap["results_recap_version"] = "002" # eval performance on the various metrics eval_testing = model.evaluate(data["testing"], labels["testing"]) for metrics_name, value in zip(model.metrics_names, eval_testing): logg.debug(f"{metrics_name}: {value}") results_recap[metrics_name] = value # compute the confusion matrix y_pred = model.predict(data["testing"]) cm = pred_hot_2_cm(labels["testing"], y_pred, words) # logg.debug(f"cm: {cm}") results_recap["cm"] = cm.tolist() # compute the fscore fscore = analyze_confusion(cm, words) logg.debug(f"fscore: {fscore}") results_recap["fscore"] = fscore # save the histories results_recap["history_train"] = { mn: results.history[mn] for mn in model.metrics_names } if use_validation: results_recap["history_val"] = { f"val_{mn}": results.history[f"val_{mn}"] for mn in model.metrics_names } # plot the cm fig, ax = plt.subplots(figsize=(12, 12)) plot_confusion_matrix(cm, ax, model_name, words, fscore) plot_cm_path = info_folder / "test_confusion_matrix.png" fig.savefig(plot_cm_path) plt.close(fig) # save the results 
res_recap_path = info_folder / "results_recap.json" res_recap_path.write_text(json.dumps(results_recap, indent=4)) # if cyclic_lr was used save the history if lr_value.startswith("clr_triangular2"): logg.debug(f"cyclic_lr.history.keys(): {cyclic_lr.history.keys()}") clr_recap = {} for metric_name, values in cyclic_lr.history.items(): clr_recap[metric_name] = list(float(v) for v in values) clr_recap_path = info_folder / "clr_recap.json" clr_recap_path.write_text(json.dumps(clr_recap, indent=4)) # save the trained model model.save(model_path) placeholder_path.write_text(f"Trained. F-score: {fscore}")
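# Hedged sketch: reload the clr_recap.json written by train_attention above and
# plot the recorded learning-rate trace ('lr' and 'iterations' are the keys the
# bckenstler CyclicLR implementation stores in its .history dict); json and
# matplotlib (as plt) are already imported in this module.
def plot_clr_recap(info_folder):
    clr_recap = json.loads((info_folder / "clr_recap.json").read_text())
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(clr_recap["iterations"], clr_recap["lr"])
    ax.set_xlabel("Training iterations")
    ax.set_ylabel("Learning rate")
    fig.savefig(info_folder / "clr_history.png")
    plt.close(fig)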
def train_test(model_type, data, labels, feature_names, percentile=[20, 40, 60, 80]): """ Train/test model using nested k-folds paradigm. Parameters ---------- model_type : string 'SVM' for support vector machine or 'DNN' for neural network. data : np.array features to use for training/testing of shape (n_samples, n_features) labels : np.array or list labels used for binary classification, ex: np.array of 1 = cannabis or 0 = control percentile : list list of percentiles to use in feature selection during grid search allows accommodatation of varying number of features feature_names : list list of feature names Returns ------- df_performance : pd.DataFrame data frame of hyperparameters and performance across folds all_tp : list list of the number of true positives in each fold all_tn : list list of the number of true negatives in each fold all_fp : list list of the number of false positives in each fold all_fn : list list of the number of false negatives in each fold top_features : list list of selected features following grid search of percentile y_test_plot : list list of test labels for each fold (for plotting performance) probas_plot : list list of prediction probabilites (for plotting performance) """ warnings.filterwarnings("ignore") # Log performance if model_type == 'SVM': col_header = ['Kernel', 'Gamma', 'Cost', 'Percentile', 'Sensitivity', 'Specificity', 'PPV', 'NPV'] elif model_type == 'DNN': col_header = ['Optimizer', 'Initializer', 'Decay', 'Batch Size', 'Activation 1', 'Activation 2', 'Percentile', 'Sensitivity', 'Specificity', 'PPV', 'NPV'] df_performance = pd.DataFrame(columns=col_header) all_tp = [] all_tn = [] all_fp = [] all_fn = [] # Plotting y_test_plot = [] probas_plot = [] # Feature importance top_features = [] # Define grid hyper-parameters if model_type == 'SVM': tuned_parameters = dict( anova__percentile = percentile, svc__kernel = ['rbf', 'sigmoid', 'poly'], svc__gamma = [2**g for g in range(-15, 4)], svc__C = [2**C for C in range(-5, 16)] ) elif model_type == "DNN": tuned_parameters = dict( anova__percentile = percentile, nn__optimizer = ['SGD', 'AdamW'], nn__init = ['glorot_normal', 'glorot_uniform'], nn__activation_1 = ['relu', 'sigmoid', 'tanh'], nn__activation_2 = ['relu', 'sigmoid', 'tanh'], nn__batch_size = [32, 64, 128, 256], nn__decay = [10.0**i for i in range(-10,-1) if i%2 == 1] ) # Cross-validation kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2) inner_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2) loop = 1 folds = [] for train_indices, test_indices in kfold.split(data, labels): print(f'Fold {loop}') # Callbacks for neural net clr = CyclicLR(mode='triangular', base_lr=0.175, max_lr=0.9175, step_size=12) es = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto', baseline=None, restore_best_weights=True) # Inner performance lists TP = [] TN = [] FP = [] FN = [] # Split data X_train = [data[idx] for idx in train_indices] y_train = [labels[idx] for idx in train_indices] X_test = [data[idx] for idx in test_indices] y_test = [labels[idx] for idx in test_indices] # Apply mean and variance centering scaler = StandardScaler().fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) # Pipe feature selection and classifier together if model_type == 'SVM': anova = SelectPercentile(f_classif) svc = SVC(class_weight='balanced', probability=True) clf = Pipeline([('anova', anova), ('svc', svc)]) elif model_type == 'DNN': anova = ANOVASelection() # Modified SelectPercentile 
class nn = KerasClassifier(build_fn=create_model, epochs=1000, verbose=0) clf = Pipeline([('anova', anova), ('nn', nn)]) # Train model clf = GridSearchCV(clf, tuned_parameters, scoring='balanced_accuracy', n_jobs=-1, cv=inner_kfold) # A random grid search can speed up computation if an analysis hangs: # clf = RandomizedSearchCV(clf, tuned_parameters, n_iter=30, scoring='balanced_accuracy', # n_jobs=-1, cv=inner_kfold) if model_type == 'SVM': clf.fit(X_train, y_train) elif model_type == 'DNN': clf.fit(X_train, y_train, nn__callbacks=[clr, es]) # Determine top features from feature selection selection = SelectPercentile(f_classif, percentile=clf.best_estimator_[0].percentile).fit(X_train, y_train) top_indices = selection.get_support(indices=True) selected_features = [feature_names[idx] for idx in top_indices] top_features.append(selected_features) # Test model y_true, y_pred = y_test, clf.predict(X_test) # Evaluate performance for idx, y in enumerate(y_true): if y == 1.0 and y == y_pred[idx]: TP.append(1) elif y == 1.0 and y != y_pred[idx]: FN.append(1) elif y == 0.0 and y == y_pred[idx]: TN.append(1) elif y == 0.0 and y != y_pred[idx]: FP.append(1) if len(FP) != 0 and len(FN) != 0: # This is most likely sensitivity = len(TP)/(len(TP)+len(FN)) specificity = len(TN)/(len(TN)+len(FP)) NPV = len(TN)/(len(TN)+len(FN)) PPV = len(TP)/(len(TP)+len(FP)) elif len(FP) != 0 and len(FN) == 0: # Likely overfitting sensitivity = 1 specificity = len(TN)/(len(TN)+len(FP)) NPV = 1 PPV = len(TP)/(len(TP)+len(FP)) elif len(FP) == 0 and len(FN) != 0: # Likely overfitting sensitivity = len(TP)/(len(TP)+len(FN)) specificity = 1 PPV = 1 NPV = len(TN)/(len(TN)+len(FN)) if len(FP) == 0 and len(FN) == 0: # Perfect classification - yeah right... sensitivity = 1 specificity = 1 NPV = 1 PPV = 1 all_tp.append(len(TP)) all_tn.append(len(TN)) all_fp.append(len(FP)) all_fn.append(len(FN)) # Append to performance df df_row_to_add = [] if model_type == 'SVM': params = ['svc__kernel', 'svc__gamma', 'svc__C', 'anova__percentile'] elif model_type == 'DNN': params = ['nn__optimizer', 'nn__init', 'nn__decay', 'nn__batch_size', 'nn__activation_1', 'nn__activation_2', 'anova__percentile'] df_row_to_add = [clf.best_params_[param] for param in params] df_row_to_add.append(sensitivity) df_row_to_add.append(specificity) df_row_to_add.append(PPV) df_row_to_add.append(NPV) folds.append('Fold ' + str(loop)) df_performance = df_performance.append(pd.Series(df_row_to_add, index=df_performance.columns), ignore_index=True) df_performance.index = folds # For plotting y_test_plot.append(y_test) probas_ = clf.predict_proba(X_test) probas_plot.append(probas_) loop += 1 return df_performance, all_tp, all_tn, all_fp, all_fn, top_features, y_test_plot, probas_plot
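# A self-contained sketch (synthetic data, not the study data) of the nested
# cross-validation pattern used by train_test above: an outer StratifiedKFold for
# performance estimation and an inner StratifiedKFold driving GridSearchCV over a
# SelectPercentile + SVC pipeline. The hyper-parameter ranges are illustrative only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=120, n_features=50, n_informative=8,
                           random_state=0)

outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=2)

param_grid = {
    "anova__percentile": [20, 40, 60, 80],
    "svc__C": [0.1, 1.0, 10.0],
    "svc__gamma": ["scale", 0.01, 0.001],
}

scores = []
for train_idx, test_idx in outer.split(X, y):
    # scale using statistics from the outer training split only
    scaler = StandardScaler().fit(X[train_idx])
    X_train, X_test = scaler.transform(X[train_idx]), scaler.transform(X[test_idx])

    pipe = Pipeline([("anova", SelectPercentile(f_classif)),
                     ("svc", SVC(class_weight="balanced", probability=True))])
    search = GridSearchCV(pipe, param_grid, scoring="balanced_accuracy",
                          cv=inner, n_jobs=-1)
    search.fit(X_train, y[train_idx])
    scores.append(balanced_accuracy_score(y[test_idx], search.predict(X_test)))

print("outer-fold balanced accuracy:", np.round(scores, 3))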
def train(self): """ train VAE model """ train_datagen = ImageDataGenerator(rescale=1. / (2**self.image_res - 1), horizontal_flip=True, vertical_flip=True) # colormode needs to be set depending on num_channels if self.nchannel == 1: train_generator = train_datagen.flow_from_directory( self.data_dir, target_size=(self.image_size, self.image_size), batch_size=self.batch_size, color_mode='grayscale', class_mode='input') elif self.nchannel == 3: print('using three channel generator!') train_generator = train_datagen.flow_from_directory( self.data_dir, target_size=(self.image_size, self.image_size), batch_size=self.batch_size, color_mode='rgb', class_mode='input') else: # expecting data saved as numpy array train_generator = NumpyDataGenerator(self.data_dir, batch_size=self.batch_size, image_size=self.image_size, nchannel=self.nchannel, image_res=self.image_res, shuffle=True) # instantiate callbacks callbacks = [] term_nan = TerminateOnNaN() callbacks.append(term_nan) csv_logger = CSVLogger(os.path.join(self.save_dir, 'training.log'), separator='\t') callbacks.append(csv_logger) checkpointer = ModelCheckpoint(os.path.join( self.save_dir, 'checkpoints/vae_weights.hdf5'), verbose=1, save_best_only=True, save_weights_only=True) callbacks.append(checkpointer) if self.earlystop: earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=8) callbacks.append(earlystop) if self.use_clr: clr = CyclicLR(base_lr=self.learn_rate, max_lr=0.0001, step_size=0.25 * self.steps_per_epoch, mode='triangular') callbacks.append(clr) if self.use_vaecb: vaecb = VAEcallback(self) callbacks.append(vaecb) self.history = self.vae.fit_generator( train_generator, epochs=self.epochs, callbacks=callbacks, steps_per_epoch=self.steps_per_epoch, verbose=self.verbose) print('saving model weights to', self.model_dir) self.vae.save_weights(os.path.join(self.model_dir, 'weights_vae.hdf5')) self.encoder.save_weights( os.path.join(self.model_dir, 'weights_encoder.hdf5')) self.decoder.save_weights( os.path.join(self.model_dir, 'weights_decoder.hdf5')) self.encode() print('done!')
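# NumpyDataGenerator is a project-specific class not shown here; below is only a
# minimal sketch of what such a generator could look like, assuming the data
# directory holds a single images.npy array and that the autoencoder expects
# (input, target=input) pairs, as class_mode='input' does for the directory
# generators above.
import os

import numpy as np
from keras.utils import Sequence


class SimpleNumpyAEGenerator(Sequence):
    """Yield (batch, batch) pairs from a .npy array for autoencoder training."""

    def __init__(self, data_dir, batch_size, image_res=8, shuffle=True):
        self.images = np.load(os.path.join(data_dir, "images.npy"))
        self.batch_size = batch_size
        self.scale = float(2 ** image_res - 1)  # same rescaling as the flow above
        self.shuffle = shuffle
        self.indices = np.arange(len(self.images))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.images) / self.batch_size))

    def __getitem__(self, idx):
        sel = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch = self.images[sel].astype("float32") / self.scale
        return batch, batch  # input and reconstruction target are the same

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)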
def _main(): # argument parsing parser = argparse.ArgumentParser( description='Trains an image similarity detector.') parser.add_argument('--training-images-dir', type=str, help='The training images directory.') parser.add_argument( '--validation-images-dir', type=str, default=None, help= 'The validation images directory. If not specified, than no validation is performed (defualt behavior).' ) parser.add_argument( '--output-dir', type=str, help='The output directory where the checkpoints will be stored.') parser.add_argument('--restart-checkpoint', type=str, default=None, help='The checkpoint from which to restart.') parser.add_argument( '--image-size', type=int, default=224, help='The image size in pixels, default is 224 (meaning 224x224).') parser.add_argument('--batch-size', type=int, default=8, help='The training minibatch size.') parser.add_argument( '--feature-vector-len', type=int, default=1024, help='The length of the feature vector (1024 by default).') parser.add_argument('--use-l2', type=int, default=0, help='If set to 1, use L2 instead of L1 difference.') parser.add_argument( '--backbone', type=str, default='siamese', help='The network backbone: siamese(default), mobilenetv2, resnet50') parser.add_argument('--freeze-backbone', type=int, default=0, help='Set to 1 to freeze the backbone (0 by default).') parser.add_argument( '--max-lr', type=float, default=1e-4, help='The maximum (and also initial) learning rate (1e-4 by default).') parser.add_argument('--min-lr', type=float, default=1e-5, help='The minimum learning rate (1e-5 by default).') parser.add_argument( '--lr-schedule', type=str, default='cosine', help='The learning rate schedule: cosine (default), cyclic.') parser.add_argument( '--lr-schedule-cycle', type=int, default=100000, help='The lerning rate cycle length (number of images).') parser.add_argument('--images-per-epoch', type=int, default=10000, help='The number of images per epoch.') parser.add_argument('--start-epoch', type=int, default=1, help='The starting epoch (1 by default).') parser.add_argument('--end-epoch', type=int, default=5000, help='The ending epoch (5000 by default).') parser.add_argument('--checkpoint-name', type=str, default='chkpt', help='The root of the checkpoint names.') parser.add_argument( '--checkpoint-freq', type=int, default=100, help='The frequency of checkpoints in epochs. Default is 100.') parser.add_argument( '--early-stopping-patience', type=int, default=-1, help= 'The number of epoch to wait before stopping if the validation loss does not decrease. Set to -1 to disable (default)' ) parser.add_argument( '--same-prob', type=float, default=0.5, help='The probability of comparing to the same image (0.5 by default).' 
) parser.add_argument( '--no-aug-prob', type=float, default=0.2, help='The probability that an image is not augmented at all.') parser.add_argument('--crop-prob', type=float, default=0.05, help='The crop probability (0.05 by default).') parser.add_argument( '--crop-frac', type=float, default=0.09, help='The maximum fraction of area cropped-out (0.16 by default).') parser.add_argument('--jitter-prob', type=float, default=0.2, help='The jitter probability (0.2 by default') parser.add_argument('--jitter', type=float, default=0.1, help='The jitter size (0.1 by default).') parser.add_argument('--rot', type=float, default=0.0, help='The rotation probability (0.0 by default).') parser.add_argument( '--hflip', type=float, default=0.0, help='The horizontal flip probability (0.0 by default).') parser.add_argument('--vflip', type=float, default=0.3, help='The vertical flip probability (0.0 by default).') parser.add_argument( '--hue', type=float, default=0.05, help='The hue variation (ignored for siamese backbone).') parser.add_argument( '--sat', type=float, default=0.2, help='The saturation variation (ignored for siamese backbone).') parser.add_argument( '--val', type=float, default=0.2, help='The value variation (ignored for siamese backbone).') parser.add_argument( '--mlflow', type=int, default=0, help='Set to 1 if using MLflow. Metrics and artifacts will be logged.') args = parser.parse_args() # start the mlflow autologging if args.mlflow: import mlflow.keras mlflow.keras.autolog() # create the image lists exts = ('.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.gif', '.GIF', '.tiff', '.TIFF', '.TIF', '.bmp', '.BMP') train_imgs = [] train_dir_files = os.listdir(args.training_images_dir) for f in train_dir_files: if f.endswith(exts): train_imgs.append(os.path.join(args.training_images_dir, f)) np.random.shuffle(train_imgs) if args.validation_images_dir: do_valid = True val_imgs = [] val_dir_files = os.listdir(args.validation_images_dir) for f in val_dir_files: if f.endswith(exts): val_imgs.append(os.path.join(args.validation_images_dir, f)) np.random.shuffle(val_imgs) else: do_valid = False print('There are {} training images.'.format(len(train_imgs))) if do_valid: print('There are {} validation images.'.format(len(val_imgs))) # create the output directory if necessary if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # create the model from model import create_model num_channels = 1 if args.backbone == 'siamese' else 3 model, model_body, encoder = create_model( (args.image_size, args.image_size, num_channels), args.feature_vector_len, restart_checkpoint=args.restart_checkpoint, backbone=args.backbone, freeze=args.freeze_backbone == 1, l2=args.use_l2 == 1) print('\nThe model:') print(model.summary()) # prepare the callbacks from lr_info import lr_info info_lr = lr_info(model, args.mlflow == 1) # learning rate # scale the larning rate to the batch size max_lr = args.max_lr * np.sqrt(args.batch_size) min_lr = args.min_lr * np.sqrt(args.batch_size) print( 'Scaling the learning rate minimum to {} and maximum (initial) to {}'. format(min_lr, max_lr)) print( 'The original values are {} and {}, and are multiplied by the root of the batch size {}.' 
        .format(args.min_lr, args.max_lr, args.batch_size))

    if args.lr_schedule == 'cosine':
        print('Using the cosine annealing learning rate scheduler.')
        from cos_callback import CosineAnnealingScheduler
        lr_callback = CosineAnnealingScheduler(
            max_lr,
            args.batch_size,
            args.lr_schedule_cycle,
            min_lr=min_lr,
            verbose=True,
            initial_counter=(args.start_epoch - 1) * args.images_per_epoch //
            args.batch_size)
    else:
        from clr_callback import CyclicLR
        lr_callback = CyclicLR(mode='triangular',
                               max_lr=max_lr,
                               base_lr=min_lr,
                               step_size=args.lr_schedule_cycle //
                               args.batch_size)

    # checkpoints
    from checkpoint import MyModelCheckpoint
    checkpoint = MyModelCheckpoint(
        filepath=os.path.join(args.output_dir,
                              args.checkpoint_name + '_' + '{epoch:04d}'),
        snapshot_path=os.path.join(args.output_dir,
                                   args.checkpoint_name + '-snapshot'),
        model_body=model_body,
        encoder=encoder,
        save_best_only=do_valid,
        period=args.checkpoint_freq,
        verbose=1,
        mlflow=args.mlflow == 1)

    callbacks = [info_lr, lr_callback, checkpoint]

    if do_valid and args.early_stopping_patience != -1:
        from keras.callbacks import EarlyStopping
        callbacks.append(
            EarlyStopping(monitor='val_loss',
                          patience=args.early_stopping_patience))

    # compile the model with the initial learning rate
    from keras.optimizers import Adam
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=max_lr))

    # train
    augment = {
        'crop_prob': args.crop_prob,
        'crop_frac': args.crop_frac,
        'jitter_prob': args.jitter_prob,
        'jitter': args.jitter,
        'rot': args.rot,
        'hflip': args.hflip,
        'vflip': args.vflip,
        'hue': args.hue,
        'sat': args.sat,
        'val': args.val
    }

    train_generator = data_generator(
        train_imgs,
        args.batch_size, (args.image_size, args.image_size, num_channels),
        args.same_prob,
        args.no_aug_prob,
        no_augment=False,
        augment=augment)

    if do_valid:
        val_generator = data_generator(
            val_imgs,
            args.batch_size, (args.image_size, args.image_size, num_channels),
            args.same_prob,
            args.no_aug_prob,
            no_augment=False,
            augment=augment)
    else:
        val_generator = None

    model.fit_generator(
        train_generator,
        steps_per_epoch=max(1, args.images_per_epoch // args.batch_size),
        validation_data=val_generator,
        validation_steps=max(1, args.images_per_epoch // args.batch_size),
        epochs=args.end_epoch,
        initial_epoch=args.start_epoch - 1,
        callbacks=callbacks)
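# The learning-rate handling above scales both LR bounds by the square root of the
# batch size and converts the cycle length from images into optimizer steps. A
# stand-alone sketch of that bookkeeping (the numbers used are illustrative only):
import numpy as np


def scaled_lr_bounds(min_lr, max_lr, batch_size):
    """Scale LR bounds by sqrt(batch size), as done in _main above."""
    factor = np.sqrt(batch_size)
    return min_lr * factor, max_lr * factor


def cycle_in_steps(cycle_images, batch_size):
    """Convert a cycle length given in images into optimizer steps."""
    return max(1, cycle_images // batch_size)


base_min, base_max = 1e-5, 1e-4
lo, hi = scaled_lr_bounds(base_min, base_max, batch_size=8)
print(f"scaled LR range: {lo:.2e} .. {hi:.2e}, "
      f"cycle = {cycle_in_steps(100000, 8)} steps")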
# if schedule is not None:
#     callbacks = [LearningRateScheduler(schedule)]
#     decay = 0.0
#     if args["schedule"] == "standard":
#         print("[INFO] using 'keras standard' learning rate decay...")
#         decay = 1e-1 / epochs
#     elif schedule is None:
#         print("[INFO] no learning rate schedule being used")

# CLR half-cycle length in iterations, derived from the training set size
stepSize = config.STEP_SIZE * (train_images.shape[0] // config.BATCH_SIZE)

file_path = "xception-hepatocyte.h5"
es, msave, reduce_lr, tb_log, log_cv = get_callbacks(file_path,
                                                     top_model,
                                                     patience=10)
clr = CyclicLR(mode=config.CLR_METHOD,
               base_lr=config.MIN_LR,
               max_lr=config.MAX_LR,
               step_size=stepSize)

print("[INFO] training network...")
H = top_model.fit_generator(
    train_datagen.flow(train_images, Y_train, batch_size=config.BATCH_SIZE),
    validation_data=valid_gen,
    steps_per_epoch=train_images.shape[0] // batch_size_for_generators,
    validation_steps=valid_images.shape[0] // batch_size_for_generators,
    epochs=config.NUM_EPOCHS,
    callbacks=[clr, msave, log_cv],
    verbose=1)

print("[INFO] evaluating network...")
predictions = top_model.predict(valid_images, batch_size=config.BATCH_SIZE)
print(
    classification_report(Y_valid.argmax(axis=1),
                          predictions.argmax(axis=1)))
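# The CLR step_size above comes from the project's config module. For reference,
# the cyclical learning-rate paper (Smith, 2017) suggests a half-cycle of 2-8x the
# number of iterations per epoch; a hedged sketch of that rule of thumb, with
# made-up sample counts:
def clr_step_size(num_train_samples, batch_size, multiplier=4):
    """Half-cycle length in iterations, using the 2-8x-per-epoch rule of thumb."""
    iterations_per_epoch = max(1, num_train_samples // batch_size)
    return multiplier * iterations_per_epoch


# e.g. 5000 training images with a batch size of 32:
print(clr_step_size(5000, 32))  # -> 624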
batchsize=100000, on_epoch_end=False, publish=publishpath + "_event_" + str(ev), use_event=ev)) loss_config.use_average_cc_pos = True model, history = train.trainModel( nepochs=1, run_eagerly=True, batchsize=nbatch, batchsize_use_sum_of_squares=False, checkperiod=1, # saves a checkpoint model every N epochs verbose=verbosity, backup_after_batches=100, additional_callbacks=callbacks + [CyclicLR(base_lr=learningrate, max_lr=learningrate * 10., step_size=10)]) loss_config.use_average_cc_pos = False loss_config.energy_loss_weight = 1e-1 loss_config.position_loss_weight = 1e-2 loss_config.timing_loss_weight = 1e-6 learningrate = 1e-5 loss_config.beta_loss_scale = 10. model, history = train.trainModel( nepochs=1 + 3, run_eagerly=True, batchsize=nbatch, batchsize_use_sum_of_squares=False, checkperiod=1, # saves a checkpoint model every N epochs
PRETRAINED_WEIGHTS = 'weights/pretrained_weights_fold%d_%s.hdf5' % ( fold, ftype) kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) for sub_fold, (train_index, valid_index) in enumerate(kf.split(x_valid, y_valid1)): x_train_fold, x_valid_fold = x_valid[train_index], x_valid[ valid_index] y_train_fold, y_valid_fold = y_valid[train_index], y_valid[ valid_index] WEIGHTS_BEST = 'weights/best_weights_fold%d_subfold%d_%s.hdf5' % ( fold, sub_fold, ftype) clr = CyclicLR(base_lr=1e-8, max_lr=8e-5) early_stoping = EarlyStopping(monitor='val_acc', patience=20, verbose=1) save_checkpoint = ModelCheckpoint(WEIGHTS_BEST, monitor='val_acc', verbose=1, save_best_only=True, save_weights_only=True, mode='max') callbacks = [early_stoping, save_checkpoint, clr] model = Stacking_Model() model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=8e-5), metrics=['accuracy'])
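# A minimal sketch of the out-of-fold pattern that the sub-fold loop above
# implements: every sample in the held-out set gets a prediction from a model that
# never saw it during training, and those predictions can then be stacked as
# features for the next-level model. predict_fn is a placeholder for training and
# evaluating one sub-fold model.
import numpy as np
from sklearn.model_selection import StratifiedKFold


def out_of_fold_predictions(x, y, predict_fn, n_splits=5, n_classes=4):
    """Collect class-probability predictions for each sample from the fold in
    which it was held out."""
    oof = np.zeros((len(x), n_classes))
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_idx, valid_idx in kf.split(x, y):
        oof[valid_idx] = predict_fn(x[train_idx], y[train_idx], x[valid_idx])
    return oof


# example with a dummy predictor that ignores the training split:
x = np.random.rand(20, 3)
y = np.array([0, 1] * 10)
dummy = lambda xt, yt, xv: np.full((len(xv), 4), 0.25)
print(out_of_fold_predictions(x, y, dummy).shape)  # (20, 4)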
# Generators
training_data = DataGenerator(subset='training', **params)
#validation_generator = DataGenerator(subset='validation', **params)

# define model
model = model_from_yaml(open('models/' + params["model"] + '.yaml'))
model.load_weights('models/' + params["model"] + '.h5', by_name=True)

# resolve the loss names from the config to the callables defined in this module
VARS = vars()
model.compile('SGD',
              loss={k: VARS[v] for k, v in params['vars_loss'].items()})

# fit model
model.fit(
    training_data,
    epochs=10000,
    #validation_data=validation_generator,
    #use_multiprocessing=True,
    #workers=6,
    callbacks=[
        CyclicLR(mode='triangular',
                 base_lr=0.000001,
                 max_lr=0.01,
                 step_size=params['steps']),
        keras.callbacks.ModelCheckpoint(filepath='models/' + params['model'] +
                                        '.h5',
                                        save_weights_only=True,
                                        monitor='loss',
                                        mode='min',
                                        save_best_only=True)
    ])
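# The vars() lookup above maps loss names from the config onto functions defined
# in the enclosing module. An explicit registry is a safer equivalent; the loss
# function and output names below are placeholders, not the ones used by this
# project.
import keras.backend as K


def example_mse(y_true, y_pred):
    """Example custom loss: mean squared error (placeholder)."""
    return K.mean(K.square(y_pred - y_true), axis=-1)


LOSS_REGISTRY = {
    "example_mse": example_mse,
    "mae": "mean_absolute_error",  # built-in losses can be referenced by name
}

# e.g. params['vars_loss'] = {"output_a": "example_mse", "output_b": "mae"}
# model.compile('SGD',
#               loss={k: LOSS_REGISTRY[v] for k, v in params['vars_loss'].items()})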
def get_lr_schedule(schedule, num_samples, batch_size, schedule_args={}): """ Creates a learning rate schedule. # Arguments: - schedule: Name of the schedule. Possible values: - 'sgd': Stochastic Gradient Descent with ReduceLROnPlateau or LearningRateSchedule callback. - 'sgdr': Stochastic Gradient Descent with Cosine Annealing and Warm Restarts. - 'clr': Cyclical Learning Rates. - 'resnet-schedule': Hand-crafted schedule used by He et al. for training ResNet. - num_samples: Number of training samples. - batch_size: Number of samples per batch. - schedule_args: Further arguments for the specific learning rate schedule. 'sgd' supports: - 'sgd_patience': Number of epochs without improvement before reducing the LR. Default: 10. - 'sgd_min_lr': Minimum learning rate. Default : 1e-4 - 'sgd_schedule': Comma-separated list of `epoch:lr` pairs, defining a learning rate schedule. The total number of epochs can be appended to this list, separated by a comma as well. If this is specified, the learning rate will not be reduced on plateaus automatically and `sgd_patience` and `sgd_min_lr` will be ignored. The following example would mean to train for 50 epochs, starting with a learning rate of 0.1 and reducing it by a factor of 10 after 30 and 40 epochs: "1:0.1,31:0.01,41:0.001,50". 'sgdr' supports: - 'sgdr_base_len': Length of the first cycle. Default: 12. - 'sgdr_mul': Factor multiplied with the length of the cycle after the end of each one. Default: 2. - 'sgdr_max_lr': Initial learning rate at the beginning of each cycle. Default: 0.1. 'clr' supports: - 'clr_step_len': Number of training epochs per half-cycle. Default: 12. - 'clr_min_lr': Minimum learning rate. Default: 1e-5. - 'clr_max_lr': Maximum learning rate: Default: 0.1. # Returns: - a list of callbacks for being passed to the fit function, - a suggested number of training epochs. 
""" if schedule.lower() == 'sgd': if ('sgd_schedule' in schedule_args) and ( schedule_args['sgd_schedule'] is not None) and (schedule_args['sgd_schedule'] != ''): def lr_scheduler(schedule, epoch, cur_lr): if schedule[0][0] > epoch: return cur_lr for i in range(1, len(schedule)): if schedule[i][0] > epoch: return schedule[i - 1][1] if schedule[ i - 1][1] is not None else cur_lr return schedule[-1][1] if schedule[-1][ 1] is not None else cur_lr schedule = [ (int(point[0]) - 1, float(point[1]) if len(point) > 1 else None) for sched_tuple in schedule_args['sgd_schedule'].split(',') for point in [sched_tuple.split(':')] ] schedule.sort() return [ keras.callbacks.LearningRateScheduler( lambda ep, cur_lr: lr_scheduler(schedule, ep, cur_lr)) ], schedule[-1][0] + 1 else: if 'sgd_patience' not in schedule_args: schedule_args['sgd_patience'] = 10 if 'sgd_min_lr' not in schedule_args: schedule_args['sgd_min_lr'] = 1e-4 return [ keras.callbacks.ReduceLROnPlateau( 'val_loss', patience=schedule_args['sgd_patience'], epsilon=1e-4, min_lr=schedule_args['sgd_min_lr'], verbose=True) ], 200 elif schedule.lower() == 'sgdr': if 'sgdr_base_len' not in schedule_args: schedule_args['sgdr_base_len'] = 12 if 'sgdr_mul' not in schedule_args: schedule_args['sgdr_mul'] = 2 if 'sgdr_max_lr' not in schedule_args: schedule_args['sgdr_max_lr'] = 0.1 return ([ SGDR(1e-6, schedule_args['sgdr_max_lr'], schedule_args['sgdr_base_len'], schedule_args['sgdr_mul']) ], sum(schedule_args['sgdr_base_len'] * (schedule_args['sgdr_mul']**i) for i in range(5))) elif schedule.lower() == 'clr': if 'clr_step_len' not in schedule_args: schedule_args['clr_step_len'] = 12 if 'clr_min_lr' not in schedule_args: schedule_args['clr_min_lr'] = 1e-5 if 'clr_max_lr' not in schedule_args: schedule_args['clr_max_lr'] = 0.1 return ([ CyclicLR(schedule_args['clr_min_lr'], schedule_args['clr_max_lr'], schedule_args['clr_step_len'] * (num_samples // batch_size), mode='triangular') ], schedule_args['clr_step_len'] * 20) elif schedule.lower() == 'resnet-schedule': def resnet_scheduler(epoch): if epoch >= 120: return 0.001 elif epoch >= 80: return 0.01 elif epoch >= 1: return 0.1 else: return 0.01 return [keras.callbacks.LearningRateScheduler(resnet_scheduler)], 164 else: raise ValueError('Unknown learning rate schedule: {}'.format(schedule))
train.compileModel(learningrate=learningrate, loss=None, metrics=None) # print(train.keras_model.) model, history = train.trainModel( nepochs=4, run_eagerly=True, batchsize=nbatch, extend_truth_list_by=len(train.keras_model.outputs) - 2, #just adapt truth list to avoid keras error (no effect on model) batchsize_use_sum_of_squares=False, checkperiod=1, # saves a checkpoint model every N epochs verbose=verbosity, backup_after_batches=100, additional_callbacks=[ CyclicLR(base_lr=learningrate / 3., max_lr=learningrate, step_size=20) ] + cb, ) #print("freeze BN") #for l in train.keras_model.layers: # if isinstance(l, BatchNormalization): # l.trainable=False # if 'GravNetLLLocalClusterLoss' in l.name: # l.active=False #also stop GravNetLLLocalClusterLoss* from being evaluated learningrate = 1e-4 train.compileModel(learningrate=learningrate, loss=None, metrics=None)
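# The commented-out block above freezes BatchNormalization layers between training
# stages. A stand-alone illustration of that pattern on a toy Keras model (the toy
# model is made up; only the freezing loop mirrors the idea sketched above):
from keras.layers import BatchNormalization, Dense, Input
from keras.models import Model

inp = Input(shape=(16,))
x = Dense(32, activation='relu')(inp)
x = BatchNormalization()(x)
out = Dense(1, activation='sigmoid')(x)
toy = Model(inp, out)

for layer in toy.layers:
    if isinstance(layer, BatchNormalization):
        layer.trainable = False  # keep BN statistics fixed in later stages

# recompile so the new trainable flags take effect
toy.compile(optimizer='adam', loss='binary_crossentropy')
print([l.name for l in toy.layers if not l.trainable])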