def make_submission_vgg19(name, name_ext, dropout_p, penultimate_size):
    """Predict test-set class probabilities with a fine-tuned VGG19 and
    write them to a submission CSV.

    Parameters
    ----------
    name : str
        Dataset/experiment name used to locate data and model files.
    name_ext : str
        Extension used to select the saved model weights file.
    dropout_p : float
        Dropout probability used when rebuilding the top classifier
        (must match the trained architecture).
    penultimate_size : int
        Size of the penultimate dense layer of the top classifier.
    """
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    # Only the test-image names are needed here.
    _, _, _, _, _, te_names = create_embeddings(name)

    batch_size = 32
    image_gen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input)
    test_flow = image_gen.flow_from_directory(
        directory=data_info['dir_te'],
        target_size=(HEIGHT, WIDTH),
        class_mode=None,
        batch_size=batch_size,
        shuffle=False)  # keep generator order aligned with te_names

    model_file = join(MODELS_DIR, MODEL_FILE.format(name, name_ext))
    base = VGG19(weights='imagenet', include_top=False,
                 input_shape=(HEIGHT, WIDTH, 3))
    head = _top_classifier(
        l2_reg=0,
        dropout_p=dropout_p,
        input_shape=(9, 9, 512),
        penultimate_size=penultimate_size)
    model = Model(inputs=base.input, outputs=head(base.output))
    model.load_weights(model_file)

    probs_pred = model.predict_generator(
        generator=test_flow,
        steps=ceil(data_info['num_te'] / batch_size))

    submission_file = 'vgg19_fine_tuned_{:s}.csv'.format(name)
    create_submission_file(
        image_names=te_names,
        probs=probs_pred,
        file_name=join(SUBMISSIONS_DIR, submission_file))
def _create_submission_file_avg_cnns(models_dir='/home/chris/painters/models/'):
    """Create a submission file from averaged CNN test-set embeddings.

    Parameters
    ----------
    models_dir : str, optional
        Directory containing the trained model(s). Previously this path
        was hard-coded inside the function; it is now a parameter whose
        default preserves the original behavior.
    """
    data_info = load_organized_data_info(IMGS_DIM_1D)
    model = _get_model(models_dir)
    X_test, img_filenames = _average_embedded_test_data(model, data_info)
    # Map each test image file name to its embedded feature vector.
    features_lookup = dict(zip(img_filenames, X_test))
    _create_submission_file(
        BATCH_SIZE, features_lookup, _calculate_batch_prediction_dot)
def _average_embedded_test_data(model, data_info):
    """Embed every test image with ``model`` and return the features with
    the matching image file names.

    Parameters
    ----------
    model : object exposing ``predict``
        The embedding model applied batch-by-batch.
    data_info : dict
        Must contain 'dir_te', 'num_te' and 'dir_tr'.

    Returns
    -------
    (numpy.ndarray, list of str)
        Feature rows in generator order, and the test file names in the
        same order.
    """
    # BUG FIX: the data_info argument was previously shadowed by an
    # unconditional reload via load_organized_data_info, so the caller's
    # value was silently ignored. Honor the argument instead.
    dir_te, num_te = data_info['dir_te'], data_info['num_te']
    dir_tr = data_info['dir_tr']

    gen = testing_generator(dir_tr=dir_tr)
    gen_test = init_directory_generator(
        gen, dir_te, BATCH_SIZE, class_mode='sparse', shuffle_=False)

    num_full_batches = num_te // BATCH_SIZE
    last_batch_size = num_te - num_full_batches * BATCH_SIZE

    X_test = None
    for i in range(num_full_batches + 1):
        if i == num_full_batches and last_batch_size == 0:
            # BUG FIX: when num_te divides evenly by BATCH_SIZE there is
            # no trailing partial batch; previously an empty batch was
            # still fetched and embedded.
            break
        X_batch, _ = next(gen_test)
        if i == num_full_batches:
            # Trim the final (cycled) batch down to the true remainder.
            X_batch = X_batch[:last_batch_size]
        embedded = model.predict(X_batch)
        X_test = embedded if X_test is None else np.vstack((X_test, embedded))

    # gen_test.filenames is ordered the same as X_test
    # (image file names with corresponding features).
    img_filenames = [basename(p) for p in gen_test.filenames]
    return X_test, img_filenames
def clean(imgs_dim=299, name=''):
    """Delete all resized image datasets (train, val, test) together
    with the accompanying info file.
    """
    data_info = load_organized_data_info(imgs_dim, name)
    for split_key in ('dir_tr', 'dir_val', 'dir_te'):
        rmtree(data_info[split_key])
    remove(organized_data_info_file(imgs_dim, name))
def __init__(self, hparams):
    """Pull the tuned hyperparameters from PEDL and load the data info."""
    super().__init__(hparams)
    # (attribute name, PEDL hyperparameter name) pairs, assigned in the
    # same order as before.
    hp_map = (
        ("kernel_size", "kernel_size"),
        ("dropout", "dropout"),
        ("pool_size", "pool_size"),
        ("l2_reg", "l2_reg"),
        ("lr", "lr"),
        ("my_batch_size", "batch_size"),
    )
    for attr, hp_name in hp_map:
        setattr(self, attr, pedl.get_hyperparameter(hp_name))
    self.data_info = load_organized_data_info(IMGS_DIM_1D)
def _train_model():
    """Train the CNN on two GPUs, saving the single-tower template model
    after every epoch.

    Cleaned up: the large block of commented-out single-GPU /
    make_parallel training code was removed; runtime behavior is
    unchanged.
    """
    # IMGS_DIM_3D order was reversed for the TensorFlow backend, so the
    # image dimension lives at index 0 instead of index 2.
    data_info = load_organized_data_info(IMGS_DIM_3D[0])
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']

    class MyCbk(Callback):
        """Save the template (non-parallel) model each epoch.

        Workaround from https://github.com/keras-team/keras/issues/8649
        (flagged in git issue 6): saving the multi-GPU wrapper directly
        does not work, so the wrapped single-tower model is saved.
        Currently it saves every epoch; could be modified to only save
        on improved accuracy.
        """

        def __init__(self, model):
            self.model_to_save = model

        def on_epoch_end(self, epoch, logs=None):
            self.model_to_save.save('model_at_epoch_%d.h5' % epoch)

    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)

    model = _cnn(IMGS_DIM_3D)
    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model = compile_model(parallel_model)
    cbk = MyCbk(model)

    parallel_model.fit_generator(
        generator=gen_tr,
        epochs=MAX_EPOCHS,
        steps_per_epoch=data_info['num_tr'],
        validation_data=gen_val,
        validation_steps=data_info['num_val'],
        callbacks=[cbk],
        verbose=1)
def make_data_loaders(experiment_config, hparams):
    """Build the train/validation data adapters for PEDL.

    multi_crop improves training, but was not used for the author's
    submission.
    """
    data_info = load_organized_data_info(IMGS_DIM_3D[1], multi_crop=True)
    gen_tr, gen_val = train_val_dirs_generators(
        BATCH_SIZE, data_info['dir_tr'], data_info['dir_val'])
    adapter_tr = KerasDataAdapter(
        gen_tr, workers=16, use_multiprocessing=True)
    adapter_val = KerasDataAdapter(
        gen_val, workers=16, use_multiprocessing=True)
    return (adapter_tr, adapter_val)
def stack(group):
    """Stack per-model predictions with a logistic-regression meta model.

    Parameters
    ----------
    group : dict
        Must contain 'name', 'width', 'height', 'uid' and 'models'
        (an iterable of (model_name, preprocess_func) pairs).

    Returns
    -------
    (te_names, te_pred)
        Test image names and the meta model's predicted probabilities.
    """
    name, width, height = group['name'], group['width'], group['height']
    group_uid, models = group['uid'], group['models']

    meta_model_file = join(
        MODELS_DIR,
        'stacking_meta_model_group_{:d}_{:s}.pickle'.format(group_uid, name))
    meta_model_fitted = isfile(meta_model_file)

    data_info = load_organized_data_info(imgs_dim=width, name=name)

    # Validation predictions are only needed when the meta model still
    # has to be fitted.
    preds_val = np.empty((data_info['num_val'], 0))
    # BUG FIX: preds_te was previously only initialized/accumulated when
    # the meta model was NOT yet fitted, yet it is always passed to
    # predict_proba below — a second run (with a saved meta model) would
    # crash with NameError. Accumulate it unconditionally.
    preds_te = np.empty((data_info['num_te'], 0))

    for model_name, preprocess_func in models:
        model_path = join(MODELS_DIR, model_name)
        if not meta_model_fitted:
            model_preds_val = _make_predictions(
                height=height, width=width, model_path=model_path,
                preprocess_func=preprocess_func, data_info=data_info,
                dir_id='val')
            preds_val = np.hstack((preds_val, model_preds_val))
        model_preds_te = _make_predictions(
            height=height, width=width, model_path=model_path,
            preprocess_func=preprocess_func, data_info=data_info,
            dir_id='te')
        preds_te = np.hstack((preds_te, model_preds_te))

    _, _, _, y_val, _, te_names = create_embeddings(name=name)

    if meta_model_fitted:
        meta_model = load(meta_model_file)
    else:
        meta_model = LogisticRegression(C=1e10)
        meta_model.fit(preds_val, y_val)
        dump(meta_model, meta_model_file)

    te_pred = meta_model.predict_proba(preds_te)
    return te_names, te_pred
def train(model_file, reduce_lr_factor=1e-1, num_freeze_layers=0, epochs=10,
          name='', reg='l2', reg_strength=0.0, dropout=0.5,
          early_stopping=False):
    """Train the regression CNN on the tagged train/val image sets.

    Parameters mirror the model builder: `reg`/`reg_strength`/`dropout`
    configure regularization, `num_freeze_layers` freezes the first N
    layers, and a truthy `early_stopping` enables EarlyStopping with
    that value as the patience.
    """
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    _, X_tr, Y_tr = _get_tagged_images(
        data_info['dir_tr'], truncate_to_id=True)
    _, X_val, Y_val = _get_tagged_images(
        data_info['dir_val'], truncate_to_id=True)

    def _flow(gen, data, labels):
        # Shuffled 32-sample batches from an in-memory array.
        return gen.flow(data, labels, batch_size=32, shuffle=True)

    model = _cnn(
        model_file=model_file,
        reg=reg,
        reg_strength=reg_strength,
        dropout_p=dropout)
    model.compile(loss='mean_squared_error', optimizer='adam')

    # model has 134 layers
    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False

    callbacks = [
        ReduceLROnPlateau(factor=reduce_lr_factor),
        ModelCheckpoint(
            _model_file_name(name, reg, reg_strength, dropout),
            save_best_only=True),
        TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True),
    ]
    if early_stopping:
        callbacks.append(EarlyStopping(
            monitor='val_loss', min_delta=1, patience=early_stopping))

    augmenter = ImageDataGenerator()
    # NOTE(review): steps_per_epoch=len(X_tr) means each epoch draws
    # len(X_tr) batches of 32 samples (~32 passes over the data per
    # epoch) — confirm this is intentional.
    model.fit_generator(
        generator=_flow(augmenter, X_tr, Y_tr),
        steps_per_epoch=len(X_tr),
        epochs=epochs,
        callbacks=callbacks,
        validation_data=_flow(augmenter, X_val, Y_val),
        validation_steps=len(X_val),
    )
def _softmax_dot():
    """Print the validation AUC of dot-product predictions over pairs of
    averaged embedded validation vectors."""
    data_info = load_organized_data_info(IMGS_DIM_1D)
    X_avg, y_val = _average_embedded_val_data(data_info)

    pair_batches = _create_pairs_generator(
        X_avg, y_val, lambda u, v: [u, v],
        num_groups=32,
        batch_size=1000000)

    y_pred = np.array([])
    y_true = np.array([])
    for X_pairs, labels in pair_batches:
        y_pred = np.hstack((y_pred, pairs_dot(X_pairs)))
        y_true = np.hstack((y_true, labels))

    print("Validation AUC: {:.4f}".format(roc_auc_score(y_true, y_pred)))
def _train_model():
    """Train the CNN, checkpointing the best model by validation loss.

    Updated from the deprecated Keras 1 fit_generator keywords
    (nb_epoch / samples_per_epoch / nb_val_samples) to the Keras 2 API,
    matching the other training functions in this codebase. Keras 2
    counts steps (batches) per epoch instead of samples, so the sample
    counts are converted with a ceiling division to cover the same data.
    """
    data_info = load_organized_data_info(IMGS_DIM_3D[1])
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']
    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)

    model = _cnn(IMGS_DIM_3D)
    # Ceiling division without importing math: -(-a // b) == ceil(a / b).
    steps_tr = -(-data_info['num_tr'] // BATCH_SIZE)
    steps_val = -(-data_info['num_val'] // BATCH_SIZE)
    model.fit_generator(
        generator=gen_tr,
        epochs=MAX_EPOCHS,
        steps_per_epoch=steps_tr,
        validation_data=gen_val,
        validation_steps=steps_val,
        callbacks=[ModelCheckpoint(CNN_MODEL_FILE, save_best_only=True)],
        verbose=2)
def _create_embedded_test_set(layer, model_path, test_set_file):
    """Embed the test images at the given layer, persist the result, and
    return the features with the file names."""
    data_info = load_organized_data_info(IMGS_DIM_1D)
    dir_te = data_info['dir_te']
    num_te = data_info['num_te']
    dir_tr = data_info['dir_tr']

    embed_model = LAYER_RESULT_FUNCS[layer](model_path)
    gen = testing_generator(dir_tr=dir_tr)
    X_te, names = _create_embedded_data_from_dir(
        embed_model, gen, dir_te, num_te, LAYER_SIZES[layer],
        is_test_set=True)

    _save_np_compressed_data(test_set_file, X_te, names)
    return X_te, names
def _create_embedded_train_val_split(layer, model_path, train_val_split_file):
    """Embed the train and validation images at the given layer, persist
    the split, and return both embedded sets with labels and names."""
    data_info = load_organized_data_info(IMGS_DIM_1D)
    dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
    dir_val, num_val = data_info['dir_val'], data_info['num_val']

    embed_model = LAYER_RESULT_FUNCS[layer](model_path)
    gen = testing_generator(dir_tr=dir_tr)

    layer_size = LAYER_SIZES[layer]
    X_tr, y_tr, names_tr = _create_embedded_data_from_dir(
        embed_model, gen, dir_tr, num_tr, layer_size)
    X_val, y_val, names_val = _create_embedded_data_from_dir(
        embed_model, gen, dir_val, num_val, layer_size)

    _save_np_compressed_data(
        train_val_split_file, X_tr, y_tr, names_tr, X_val, y_val, names_val)
    return X_tr, y_tr, names_tr, X_val, y_val, names_val
def _train_model():
    """Train the CNN with multiprocess data loading, checkpointing the
    best model by validation loss."""
    data_info = load_organized_data_info(IMGS_DIM_3D[1])
    gen_tr, gen_val = train_val_dirs_generators(
        BATCH_SIZE, data_info['dir_tr'], data_info['dir_val'])

    model = _cnn(IMGS_DIM_3D)
    val_steps = math.ceil(data_info['num_val'] / BATCH_SIZE)
    model.fit_generator(
        generator=gen_tr,
        epochs=MAX_EPOCHS,
        steps_per_epoch=300,  # fixed number of batches per epoch
        validation_data=gen_val,
        validation_steps=val_steps,
        validation_freq=10,  # validate only every 10th epoch
        callbacks=[ModelCheckpoint(CNN_MODEL_FILE, save_best_only=True)],
        workers=16,
        use_multiprocessing=True,
        verbose=1)
def resume_training(model_file, name='', reduce_lr_factor=1e-1,
                    num_freeze_layers=0, epochs=10, reg='l2',
                    reg_strength=0.0, dropout=0.5):
    """Resume training a previously saved regression CNN.

    Loads the full model from `model_file`, optionally freezes the first
    `num_freeze_layers` layers, and continues fitting with the same
    callbacks as the initial training run.
    """
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    _, X_tr, Y_tr = _get_tagged_images(
        data_info['dir_tr'], truncate_to_id=True)
    _, X_val, Y_val = _get_tagged_images(
        data_info['dir_val'], truncate_to_id=True)

    def _flow(gen, data, labels):
        # Shuffled 32-sample batches from an in-memory array.
        return gen.flow(data, labels, batch_size=32, shuffle=True)

    model = load_model(model_file)
    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False

    callbacks = [
        ReduceLROnPlateau(factor=reduce_lr_factor),
        ModelCheckpoint(
            _model_file_name(name, reg, reg_strength, dropout),
            save_best_only=True),
        TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True),
    ]

    augmenter = ImageDataGenerator()
    model.fit_generator(
        generator=_flow(augmenter, X_tr, Y_tr),
        steps_per_epoch=len(X_tr),
        epochs=epochs,
        callbacks=callbacks,
        validation_data=_flow(augmenter, X_val, Y_val),
        validation_steps=len(X_val),
    )
def create_embeddings(name):
    """Returns vgg16 embeddings (outputs of the last conv layer).

    Returns
    -------
    tuple
        X_tr (n_samples, 9, 9, 512)
        y_tr (n_samples,)
        X_val (n_samples, 9, 9, 512)
        y_val (n_samples,)
        X_te (n_samples, 9, 9, 512)
        te_names (n_samples,)
    """
    embeddings_file = join(DATA_DIR, EMBEDDINGS_FILE.format(name))
    if isfile(embeddings_file):
        # Cached embeddings exist — load and return them directly.
        cached = np.load(embeddings_file)
        return (cached['X_tr'], cached['y_tr'], cached['X_val'],
                cached['y_val'], cached['X_te'], cached['te_names'])

    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input)
    batch_size = 32

    def dir_datagen(dir_):
        # shuffle=False keeps generator order aligned with file order.
        return datagen.flow_from_directory(
            directory=dir_,
            target_size=(HEIGHT, WIDTH),
            class_mode=None,
            batch_size=batch_size,
            shuffle=False)

    model = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(HEIGHT, WIDTH, 3))

    def embed(dir_, num, data_is_labeled):
        X = model.predict_generator(
            generator=dir_datagen(dir_),
            steps=ceil(num / batch_size))
        if data_is_labeled:
            # Rebuild labels from per-class counts; generator order is
            # Type_1 images first, then Type_2, then Type_3.
            num_per_cls = num_examples_per_class_in_dir(dir_)
            y = np.hstack((
                np.zeros(num_per_cls['Type_1']),
                np.zeros(num_per_cls['Type_2']) + 1,
                np.zeros(num_per_cls['Type_3']) + 2,
            ))
            return X, y
        # unlabeled (test) dataset
        names = [f for f in listdir(join(dir_, 'all')) if f.endswith('.jpg')]
        return X, np.array(names)

    X_tr, y_tr = embed(
        data_info['dir_tr'], data_info['num_tr'], data_is_labeled=True)
    X_val, y_val = embed(
        data_info['dir_val'], data_info['num_val'], data_is_labeled=True)
    X_te, te_names = embed(
        data_info['dir_te'], data_info['num_te'], data_is_labeled=False)

    np.savez_compressed(
        file=embeddings_file, X_tr=X_tr, y_tr=y_tr, X_val=X_val,
        y_val=y_val, X_te=X_te, te_names=te_names)

    print("Embedded data shapes:")
    print("X_tr {0}".format(X_tr.shape))
    print("y_tr {0}".format(y_tr.shape))
    print("X_val {0}".format(X_val.shape))
    print("y_val {0}".format(y_val.shape))
    print("X_te {0}".format(X_te.shape))
    print("te_names {0}".format(te_names.shape))
    return X_tr, y_tr, X_val, y_val, X_te, te_names
def fine_tune(name, name_ext, lr=1e-4, reduce_lr_factor=0.1,
              reduce_lr_patience=3, epochs=10, batch_size=32, l2_reg=0,
              dropout_p=0.5, num_freeze_layers=0, save_best_only=True,
              loss_stop_val=0.00001, penultimate_size=256):
    """Fine-tune an ImageNet-pretrained VGG16 with a pre-trained top
    classifier on the organized train/val image directories.

    Parameters
    ----------
    name, name_ext : str
        Identify the dataset and the output model weights file.
    lr : float
        Adam learning rate.
    reduce_lr_factor, reduce_lr_patience : ReduceLROnPlateau settings.
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Training/validation batch size.
    l2_reg, dropout_p, penultimate_size : top-classifier architecture
        settings (must match the saved top-classifier weights).
    num_freeze_layers : int
        Number of leading layers to freeze (the model has 20 layers).
    save_best_only : bool
        Passed through to ModelCheckpoint.
    loss_stop_val : float
        Training stops early once loss drops below this value.
    """
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)

    tr_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input,
        rotation_range=180,
        vertical_flip=True,
        horizontal_flip=True)
    val_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input)

    # BUG FIX: batch_size was previously clobbered by an unconditional
    # `batch_size = 32` right after the generators were built, silently
    # ignoring the caller's value. The parameter is now honored (the
    # default keeps the old behavior).

    def dir_datagen(dir_, gen):
        return gen.flow_from_directory(
            directory=dir_,
            target_size=(HEIGHT, WIDTH),
            class_mode='categorical',
            batch_size=batch_size,
            shuffle=True)

    dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
    dir_val, num_val = data_info['dir_val'], data_info['num_val']

    top_classifier_file = join(
        MODELS_DIR, TOP_CLASSIFIER_FILE.format(name, penultimate_size))
    model_file = join(MODELS_DIR, MODEL_FILE.format(name, name_ext))

    model = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(HEIGHT, WIDTH, 3))
    top_classifier = _top_classifier(
        l2_reg=l2_reg,
        dropout_p=dropout_p,
        input_shape=(9, 9, 512),
        penultimate_size=penultimate_size)
    top_classifier.load_weights(top_classifier_file)
    model = Model(inputs=model.input, outputs=top_classifier(model.output))
    model.compile(Adam(lr=lr), loss='categorical_crossentropy')

    # model has 20 layers
    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False

    log_dir = join(EXPERIMENTS_DIR, 'vgg16_fine_tuned_{:s}'.format(name))
    callbacks = [
        EarlyStoppingByLoss(monitor='loss', value=loss_stop_val),
        ReduceLROnPlateau(
            factor=reduce_lr_factor, patience=reduce_lr_patience),
        ModelCheckpoint(model_file, save_best_only=save_best_only),
        TensorBoard(log_dir=log_dir, write_graph=False),
    ]

    model.fit_generator(
        generator=dir_datagen(dir_tr, tr_datagen),
        steps_per_epoch=ceil(num_tr / batch_size),
        epochs=epochs,
        validation_data=dir_datagen(dir_val, val_datagen),
        validation_steps=ceil(num_val / batch_size),
        callbacks=callbacks)
def _add_test_info_to_organized_data_info(imgs_dim, name, num_test_samples,
                                          new_dir_te):
    """Record the test directory and sample count in the persisted
    organized-data info file."""
    data_info = load_organized_data_info(imgs_dim, name)
    data_info.update(
        dir_te=dirname(new_dir_te),
        num_te=num_test_samples)
    save_organized_data_info(data_info, imgs_dim, name)
def _create_submission_file_avg_cnns():
    """Build the submission file from averaged embedded test data using
    dot-product batch predictions."""
    data_info = load_organized_data_info(IMGS_DIM_1D)
    X_avg, names = _average_embedded_test_data(data_info)
    # Map each test file name to its averaged embedding.
    features_lookup = dict(zip(names, X_avg))
    _create_submission_file(
        BATCH_SIZE, features_lookup, _calculate_batch_prediction_dot)
def _append_num_te_to_organized_data_info(num_test_samples, multi_crop):
    """Persist the number of test samples into the organized-data info
    for the first 2-D image dimension."""
    dim = IMGS_DIM_2D[0]
    data_info = load_organized_data_info(dim, multi_crop=multi_crop)
    data_info['num_te'] = num_test_samples
    save_organized_data_info(data_info, dim, multi_crop=multi_crop)