def extract_cnn_features(cnn_model, layer, features_dir):
    # Prepare data augmentation configuration
    datagen = ImageDataGenerator(rescale=1. / 255)

    base_model = cnn_model.load(weights=cnn_model.weights)
    target_size = (cnn_model.img_height, cnn_model.img_width)

    # Build a model that outputs the activations of the requested layer
    layers_by_name = {l.name: l for l in base_model.layers}
    outputs = layers_by_name[layer].output
    model = Model(inputs=base_model.input, outputs=outputs)

    users = IO.load_annotations(ntcir.filepaths)
    for user_id, user in users.iteritems():
        for date, day in user.iteritems():
            for image in day.images:
                img = load_image(datagen, image.path, target_size)
                image.features = model.predict(img).copy()

    features_filepath = os.path.join(features_dir, "features." + cnn_model.name + ".pkl")
    with open(features_filepath, 'wb') as f:
        pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

    del model
    if K.backend() == 'tensorflow':
        K.clear_session()
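
# Example usage (a minimal sketch): cnn_model is expected to be an attribute-style
# config such as an EasyDict with name, img_height, img_width, weights and a load()
# callable returning a Keras model. The field values and the loader below are
# hypothetical placeholders, not the repository's actual configuration.
#
#     from easydict import EasyDict as edict
#
#     vgg16 = edict({'name': 'vgg16',
#                    'img_height': 224,
#                    'img_width': 224,
#                    'weights': 'weights/weights.vgg16.hdf5',   # hypothetical path
#                    'load': load_vgg16})                       # hypothetical loader
#
#     extract_cnn_features(vgg16, 'fc2', 'features/')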

def extract_cnn_features(cnn_model, layers, features_dir, start_fold=1, end_fold=10):
    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        # Prepare data augmentation configuration
        datagen = ImageDataGenerator(rescale=1. / 255)

        init_weights = cnn_model.weights.format(fold)
        base_model = cnn_model.load(weights=init_weights)
        target_size = (cnn_model.img_height, cnn_model.img_width)

        # Build a model that outputs the activations of every requested layer
        layers_by_name = {l.name: l for l in base_model.layers}
        outputs = [layers_by_name[l].output for l in layers]
        model = Model(inputs=base_model.input, outputs=outputs)

        users = IO.load_annotations(ntcir.filepaths)
        for user_id, user in users.iteritems():
            for date, day in user.iteritems():
                for image in day.images:
                    img = load_image(datagen, image.path, target_size)
                    predictions = model.predict(img)
                    # With a single output layer, predict() returns an array instead of a list
                    if len(layers) == 1:
                        predictions = [predictions]
                    image.features = {l: predictions[i].copy() for i, l in enumerate(layers)}

        features_filepath = os.path.join(features_dir,
                                         "features." + cnn_model.name + ".fold_" + fold + ".pkl")
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
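
# Example usage (a sketch; the layer names and config are hypothetical): when several
# layers are requested, each image ends up with a dict of feature arrays keyed by
# layer name, e.g. image.features['fc6'] and image.features['fc7'].
#
#     extract_cnn_features(vgg16_fold_config, ['fc6', 'fc7'], 'features/',
#                          start_fold=1, end_fold=10)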

def train(features_filepath, weights_dir, rf_model, start_fold=1, end_fold=1, timestep=5,
          progress_percent=0.05, iccv_epic=True, features_size=4096, cores=None):
    np.random.seed(42)

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    if not start_fold:
        start_fold = current_fold(weights_dir, rf_model.name)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            user_features = pickle.load(f)

        if iccv_epic:
            train_split = ntcir.read_split('datasets/ntcir/training_split.txt')
        else:
            train_split = ntcir.get_split_fold(sorted_users, int(fold) - 1)

        training_batches = ntcir.get_training_batches(train_split, sequences, timestep=timestep)

        num_features = timestep * features_size
        num_training_batches = len(training_batches)

        # Build the training matrix: each row concatenates the CNN features of the
        # images in a batch; the target is the label of the last image.
        features = np.zeros((num_training_batches, num_features))
        targets = np.zeros(num_training_batches)

        if progress_percent:
            training_progress_percent = int(num_training_batches * progress_percent)
            print "Creating training matrix for fold {}".format(fold)

        for i, batch in enumerate(training_batches):
            day = user_features[batch.user_id][batch.date]
            for j, ind in enumerate(batch.indices):
                image = day.images[ind]
                start_ind = j * features_size
                end_ind = (j + 1) * features_size
                features[i, start_ind:end_ind] = image.features

            last_ind = batch.indices[-1]
            targets[i] = day.images[last_ind].label

            if progress_percent and (i + 1) % training_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (float(i + 1) / num_training_batches * 100,
                                                    i + 1, num_training_batches))

        gc.collect()

        if not cores:
            cores = multiprocessing.cpu_count()

        random_forest = RandomForestClassifier(n_estimators=rf_model.num_estimators, n_jobs=cores)
        random_forest.fit(features, targets)

        weights_filepath = os.path.join(weights_dir, "weights." + rf_model.name + ".fold_" + fold + ".pkl")
        with open(weights_filepath, 'wb') as f:
            pickle.dump(random_forest, f, pickle.HIGHEST_PROTOCOL)

    return random_forest
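
# A minimal, self-contained sketch of the row layout used above: each training sample
# concatenates the `timestep` per-frame CNN feature vectors into a single row, and the
# target is the label of the last frame in the sequence. The toy sizes are illustrative.
import numpy as np

def build_row(frame_features, features_size):
    # frame_features: list of `timestep` 1-D arrays, each of length features_size
    row = np.zeros(len(frame_features) * features_size)
    for j, feat in enumerate(frame_features):
        row[j * features_size:(j + 1) * features_size] = feat
    return row

_toy = [np.full(4, j, dtype=float) for j in range(3)]  # timestep=3, features_size=4
assert build_row(_toy, 4).tolist() == [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]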

def train(features_filepath, weights_dir, sgd_params, base_model, start_fold=None, end_fold=5,
          timestep=10, batch_size=1, iccv_epic=False):
    np.random.seed(42)

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(weights_dir, base_model.name)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            train_split = ntcir.read_split('datasets/ntcir/training_split.txt')
            test_split = ntcir.read_split('datasets/ntcir/validation_split.txt')
        else:
            train_split = ntcir.get_split_fold(sorted_users, int(fold) - 1)
            test_split = ntcir.get_split_fold(sorted_users, int(fold) - 1, False)

        training_batches = ntcir.get_training_batches(train_split, sequences, timestep=timestep)
        test_batches = ntcir.get_batches(test_split, sequences, timestep=timestep)

        K.set_learning_phase(1)
        model = base_model.load(feature_vector_length=base_model.feature_vector_length,
                                timestep=timestep)

        sgd = SGD(lr=sgd_params.lr, decay=sgd_params.decay, momentum=sgd_params.momentum,
                  nesterov=sgd_params.nesterov)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

        steps_per_epoch = int(len(training_batches) / batch_size)
        train_generator = generate_batch(features, training_batches, base_model.feature_vector_length,
                                         batch_size, timestep, steps_per_epoch)

        validation_steps = int(len(test_batches) / batch_size)
        validation_generator = generate_batch(features, test_batches, base_model.feature_vector_length,
                                              batch_size, timestep, validation_steps)

        # Checkpoint the weights after every epoch
        base_model_weights = "weights." + base_model.name + ".fold_" + fold + \
                             ".epoch_{epoch:02d}." + backend + ".hdf5"
        weights_filepath = os.path.join(weights_dir, base_model_weights)
        checkpoint = ModelCheckpoint(weights_filepath, monitor='val_acc', verbose=1, save_best_only=False)
        history = HistoryLog()

        # Fine-tune the model
        model.fit_generator(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=10,
            callbacks=[checkpoint, history],
            validation_data=validation_generator,
            validation_steps=validation_steps)

        ts = time()
        timestamp = datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')

        loss_filepath = os.path.join(weights_dir,
                                     "{}.fold_{}.loss.{}.log".format(base_model.name, fold, timestamp))
        history.log_training_loss(loss_filepath)

        epoch_filepath = os.path.join(weights_dir,
                                      "{}.fold_{}.epoch.{}.log".format(base_model.name, fold, timestamp))
        history.log_epoch(epoch_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
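
# Example invocation (a sketch, not the repository's actual configuration): sgd_params
# only needs lr / decay / momentum / nesterov attributes, so an EasyDict works; the
# lstm_model object and the file locations below are hypothetical.
#
#     from easydict import EasyDict as edict
#
#     sgd_params = edict({'lr': 1e-3, 'decay': 1e-6, 'momentum': 0.9, 'nesterov': True})
#     train('features/features.vgg16.fold_{}.pkl', 'weights/', sgd_params, lstm_model,
#           start_fold=1, end_fold=5, timestep=10)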

import numpy as np
import ntcir
import ntcir.IO as IO
import os
import os.path as osp
import itertools
import utils
import shutil

from collections import defaultdict
from easydict import EasyDict as edict


# In[2]:

users = IO.load_annotations(ntcir.filepaths)
sorted_users = ntcir.utils.sort(users)
categories = IO.load_categories(ntcir.filepaths)
users_ids = sorted(users.keys())

days = defaultdict(lambda: defaultdict(ntcir.Day))
for user in sorted_users:
    for day in user.days:
        days[user.id_][day.date] = day

splits = edict({'train': 0, 'validation': 1, 'test': 2})


# # Classification dataset split

# In[ ]:

def test(features_filepath, results_dir, rf_model, start_fold=1, end_fold=1, timestep=5,
         progress_percent=0.05, iccv_epic=True, features_size=4096):
    np.random.seed(42)

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            user_features = pickle.load(f)

        weights = rf_model.weights.format(fold)
        rf = load_random_forest(weights)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users, int(fold) - 1, False)

        test_batches = ntcir.get_training_batches(test_split, sequences, timestep=timestep)

        num_features = timestep * features_size
        num_test_batches = len(test_batches)

        if progress_percent:
            test_progress_percent = int(num_test_batches * progress_percent)
            print "Testing fold {}".format(fold)

        # Build the test matrix: each row concatenates the CNN features of the images
        # in a batch; the groundtruth is the label of the last image.
        features = np.zeros((num_test_batches, num_features))
        img_paths = list()
        labels = list()
        for i, batch in enumerate(test_batches):
            day = user_features[batch.user_id][batch.date]
            for j, ind in enumerate(batch.indices):
                image = day.images[ind]
                start_ind = j * features_size
                end_ind = (j + 1) * features_size
                features[i, start_ind:end_ind] = image.features

            last_ind = batch.indices[-1]
            img_paths.append(day.images[last_ind].path)
            labels.append(day.images[last_ind].label)

            if progress_percent and (i + 1) % test_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (float(i + 1) / num_test_batches * 100,
                                                    i + 1, num_test_batches))

        predictions = rf.predict(features).astype(np.int)

        results = list()
        for i in range(num_test_batches):
            results.append((img_paths[i], labels[i], predictions[i]))

        results_fname = "{}.fold_{}.{}.csv".format(rf_model.name, fold, backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(results, results_filepath)

def extract_castro_features(cnn_model, data_dir, features_dir, start_fold=1, end_fold=5,
                            num_categories=21, num_bins=10, progress_percent=.05):
    backend = 'tf' if K.backend() == 'tensorflow' else 'th'
    target_size = (cnn_model.img_height, cnn_model.img_width)
    datagen = ImageDataGenerator(rescale=1. / 255)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        weights = cnn_model.weights.format(fold)
        model = cnn_model.load(weights=weights)

        users = IO.load_annotations(ntcir.filepaths)

        # Index every annotated image by its relative path (user_id/date/filename)
        ind_by_img_path = dict()
        for user_id, days in users.iteritems():
            for date, day in days.iteritems():
                for ind, image in enumerate(day.images):
                    relative_path = '/'.join(image.path.split('/')[-3:])
                    ind_by_img_path[relative_path] = ind

        test_dir = os.path.join(data_dir, fold, 'test')
        train_dir = os.path.join(data_dir, fold, 'train')
        validation_dir = os.path.join(data_dir, fold, 'validation')
        if os.path.isdir(validation_dir):
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir) + read_fold_dir(validation_dir)
        else:
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir)

        num_images = len(images)
        images_progress_percent = int(num_images * progress_percent)

        print 'Extracting temporal features on fold {} for {}'.format(fold, cnn_model.name)
        for i, (label, img_path) in enumerate(images):
            # Resolve the annotated image that corresponds to this file
            rpath = os.path.realpath(img_path)
            user_id, date, filename = rpath.split('/')[-3:]
            relative_path = '/'.join([user_id, date, filename])
            img_ind = ind_by_img_path[relative_path]
            image = users[user_id][date].images[img_ind]

            img = load_image(datagen, img_path, target_size)

            # Feature vector: CNN class probabilities, time of day, weekday and colour histogram
            features = np.zeros((num_categories + 3 * num_bins + 3))
            features[:num_categories] = model.predict(img)
            features[num_categories] = image.hour
            features[num_categories + 1] = image.minute
            features[num_categories + 2] = image.weekday
            features[num_categories + 3:] = get_histogram(image.path, num_bins)

            image.features = features

            if progress_percent and (i + 1) % images_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (float(i + 1) / num_images * 100, i + 1, num_images))

        features_filepath = "features.{}.fold_{}.{}.pkl".format(cnn_model.name, fold, backend)
        features_filepath = os.path.join(features_dir, features_filepath)
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
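
# A small, self-contained sketch of the feature layout assembled above: the first
# num_categories slots hold the CNN class probabilities, the next three hold the hour,
# minute and weekday, and the remaining 3 * num_bins slots hold the colour histogram.
# With the defaults (21 categories, 10 bins) each vector has 54 entries.
num_categories, num_bins = 21, 10
feature_length = num_categories + 3 + 3 * num_bins
assert feature_length == 54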

def extract_rf_features(data_dir, features_dir, cnn_model, rf_model, start_fold=1, end_fold=5,
                        progress_percent=.1):
    backend = 'tf' if K.backend() == 'tensorflow' else 'th'
    target_size = (cnn_model.img_height, cnn_model.img_width)
    datagen = ImageDataGenerator(rescale=1. / 255)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        weights = cnn_model.weights.format(fold)
        base_model = cnn_model.load(weights=weights)

        # Build a model that outputs the activations of the layer fed to the random forest
        layers_by_name = {l.name: l for l in base_model.layers}
        outputs = [layers_by_name[rf_model.layer].output]
        model = Model(inputs=base_model.input, outputs=outputs)

        weights = rf_model.weights.format(fold)
        rf = load_random_forest(weights)

        users = IO.load_annotations(ntcir.filepaths)

        # Index every annotated image by its relative path (user_id/date/filename)
        ind_by_img_path = dict()
        for user_id, days in users.iteritems():
            for date, day in days.iteritems():
                for ind, image in enumerate(day.images):
                    relative_path = '/'.join(image.path.split('/')[-3:])
                    ind_by_img_path[relative_path] = ind

        test_dir = os.path.join(data_dir, fold, 'test')
        train_dir = os.path.join(data_dir, fold, 'train')
        validation_dir = os.path.join(data_dir, fold, 'validation')
        if os.path.isdir(validation_dir):
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir) + read_fold_dir(validation_dir)
        else:
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir)

        num_images = len(images)
        images_progress_percent = int(num_images * progress_percent)

        print 'Extracting temporal features on fold {} for {} + RF on layer {}'.format(
            fold, cnn_model.name, rf_model.layer)
        for i, (label, img_path) in enumerate(images):
            img = load_image(datagen, img_path, target_size)
            predictions = model.predict(img)

            # Use the random forest class probabilities as the image features
            features = predictions[0].copy()
            probability = rf.predict_proba([features])[0]

            rpath = os.path.realpath(img_path)
            user_id, date, filename = rpath.split('/')[-3:]
            relative_path = '/'.join([user_id, date, filename])
            img_ind = ind_by_img_path[relative_path]
            image = users[user_id][date].images[img_ind]
            image.features = probability.copy()

            if progress_percent and (i + 1) % images_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (float(i + 1) / num_images * 100, i + 1, num_images))

        features_filepath = "features.{}.fold_{}.{}.pkl".format(rf_model.name, fold, backend)
        features_filepath = os.path.join(features_dir, features_filepath)
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()

def test(features_filepath, results_dir, base_model, start_fold, end_fold, timestep, iccv_epic=False):
    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(results_dir, base_model.name + '.fold')

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users, int(fold) - 1, False)

        test_batches = ntcir.get_batches(test_split, sequences, timestep=timestep, include_last=True)

        K.set_learning_phase(False)
        weights = base_model.best_weights.format(fold)
        model = base_model.load(
            feature_vector_length=base_model.feature_vector_length,
            weights=weights,
            timestep=timestep)

        frames = list()
        groundtruth = list()
        predictions = list()
        for i, batch in enumerate(test_batches):
            x, y = load_batch(
                features, batch,
                feature_vector_length=base_model.feature_vector_length,
                batch_size=1,
                timestep=timestep)

            # Predict a label for every frame of the sequence (many-to-many)
            prediction = model.predict_on_batch(x)
            prediction = np.argmax(prediction, axis=2).squeeze()[0:batch.size]
            predictions.extend(prediction)
            groundtruth.extend(np.argmax(y, axis=2).squeeze()[0:batch.size])

            for j, ind in enumerate(batch.indices):
                image = features[batch.user_id][batch.date].images[ind]
                frames.append(image.path)

        results_fname = "{}.fold_{}.{}.csv".format(base_model.name, fold, backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(frames, groundtruth, predictions, results_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()

def test(features_filepath, results_dir, base_model, start_fold, end_fold, timestep,
         iccv_epic=False, progress_percent=0.05):
    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(results_dir, base_model.name + '.fold')

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users, int(fold) - 1, False)

        test_batches = ntcir.get_training_batches(test_split, sequences, timestep=timestep)

        K.set_learning_phase(False)
        weights = base_model.best_weights.format(fold)
        model = base_model.load(
            feature_vector_length=base_model.feature_vector_length,
            weights=weights,
            timestep=timestep)

        num_test_batches = len(test_batches)
        if progress_percent:
            test_progress_percent = int(num_test_batches * progress_percent)
            print "Testing fold {}".format(fold)

        results = list()
        for i, batch in enumerate(test_batches):
            x, y = load_batch(
                features, batch,
                feature_vector_length=base_model.feature_vector_length,
                batch_size=1,
                timestep=timestep)

            # Keep only the prediction for the last frame of the sequence (many-to-one)
            prediction = model.predict_on_batch(x)
            prediction = np.argmax(prediction, axis=2).squeeze()[-1]

            ind = batch.indices[-1]
            image = features[batch.user_id][batch.date].images[ind]
            results.append((image.path, image.label, prediction))

            if progress_percent and (i + 1) % test_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (
                    float(i + 1) / num_test_batches * 100, i + 1, num_test_batches))

        results_fname = "{}.many2one.fold_{}.{}.csv".format(base_model.name, fold, backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(results, results_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
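
# A minimal numpy illustration (with dummy data) of the many-to-one selection above:
# the recurrent model emits one softmax vector per timestep, argmax over the class axis
# gives a label per frame, and only the label of the last frame is kept.
import numpy as np

dummy = np.random.rand(1, 5, 21)                # (batch_size=1, timestep=5, num_classes=21)
per_frame = np.argmax(dummy, axis=2).squeeze()  # one predicted label per frame
last_label = per_frame[-1]                      # many-to-one: keep only the last frame
assert per_frame.shape == (5,) and 0 <= last_label < 21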