def main_video():
    """Train the video classifier, validating on the 'val' split.

    Labels and video features for each split are obtained from
    ``gather_features(..., return_plot=False)`` and passed on to
    ``train_classifier_video``.
    """
    train_y, train_x = gather_features('train', return_plot=False)
    val_y, val_x = gather_features('val', return_plot=False)
    train_classifier_video(train_x, train_y, val_x, val_y)
def main_text():
    """Train the word-embedding classifier on plot features.

    Plot features are gathered with ``reverse=True`` for the 'train' and
    'val' splits, transposed, and passed to
    ``train_classifier_word_embedding`` together with the labels.
    """
    train_y, train_plots = gather_features(
        'train', return_video=False, reverse=True)
    val_y, val_plots = gather_features(
        'val', return_video=False, reverse=True)

    # Transpose the input to get two lists: the text and its reversed
    # counterpart, index-aligned with each other.
    train_plots = np.array([list(column) for column in zip(*train_plots)])
    val_plots = np.array([list(column) for column in zip(*val_plots)])

    train_classifier_word_embedding(train_plots, train_y, val_plots, val_y)
def main_vislang():
    """Train the joint video + plot classifier.

    The merge mode is read from ``argv[2]`` at the moment the training
    call is made. Plot features are gathered with ``reverse=True`` and
    transposed before being handed to ``train_classifier_vislang``.
    """
    train_y, train_plots, train_video = gather_features(
        'train', reverse=True)
    val_y, val_plots, val_video = gather_features(
        'val', reverse=True)

    # Transpose so each plot-feature array holds index-aligned sequences.
    train_plots = np.array([list(column) for column in zip(*train_plots)])
    val_plots = np.array([list(column) for column in zip(*val_plots)])

    train_classifier_vislang(
        train_video, train_plots, train_y,
        val_video, val_plots, val_y,
        merge_mode=argv[2])
def generate_precision_recall_text(mode='val'):
    """Pickle (labels, predictions) of the 'text' model for one split.

    mode -- split name passed to ``gather_features``; also used as the
            prefix of the dumped pickle name (``<mode>_pred_text``).
    """
    text_model = load_moviescope_model('text')
    y_true, plots = gather_features(mode, return_video=False, reverse=True)

    # Transpose, then feed the first two rows as the model's two inputs.
    plots = np.array([list(column) for column in zip(*plots)])
    model_inputs = [plots[0], plots[1]]
    y_pred = text_model.predict(model_inputs)

    dump_pkl((y_true, y_pred), mode+'_pred_text')
def generate_precision_recall_video(mode='val'):
    """Pickle (labels, predictions) of the video model for one split.

    mode -- split name passed to ``gather_features``; also used as the
            prefix of the dumped pickle name (``<mode>_pred_video_sgd``).
    """
    video_model = load_moviescope_model('wiki_im_video_sgd')
    y_true, frames = gather_features(mode, return_plot=False)

    # Only the features returned by augment_labels_lstm are kept; the
    # labels it returns are discarded and the gathered ones are dumped.
    augmented = augment_labels_lstm(y_true, frames, 200)
    _, frames = augmented

    y_pred = video_model.predict(frames)
    dump_pkl((y_true, y_pred), mode+'_pred_video_sgd')
def generate_precision_recall_vislang(mode='val', merge_mode='sum'):
    """Pickle (labels, predictions) of a joint video + plot model.

    mode       -- split name passed to ``gather_features``.
    merge_mode -- selects the model. 'bilinear' builds the network with
                  ``vislang_model`` and loads weights only; any other
                  value loads the saved ``eq_VisLang_<merge_mode>`` model.
    """
    if merge_mode != 'bilinear':
        fusion_model = load_moviescope_model('eq_VisLang_%s' % merge_mode)
    else:
        fusion_model = vislang_model(merge_mode)
        fusion_model.load_weights(
            'data/weights/weights_min_loss_%s.h5' % merge_mode)

    y_true, plots, frames = gather_features(mode, reverse=True)
    plots = np.array([list(column) for column in zip(*plots)])
    _, frames = augment_labels_lstm(y_true, frames, 200)

    # Input order: video features first, then the first two plot rows.
    model_inputs = [frames, plots[0], plots[1]]
    y_pred = fusion_model.predict(model_inputs)

    dump_pkl((y_true, y_pred), mode+'_pred_eq_vislang_'+merge_mode)
def return_confident_results(mode='val'):
    """Collect per-genre prediction scores and print the top ten of each.

    Runs the saved 'wiki_im_VisLang' model over the given split, builds a
    dict mapping every genre index to a list of ``(score, movieId)``
    tuples, pickles that dict as ``genrePredictionDict_<mode>`` and then
    prints, for every genre, the ten highest-sorting tuples.

    mode -- split name passed to ``gather_features``.
    """
    # Number of genre columns read from each prediction row.
    num_genres = 26

    model = load_moviescope_model('wiki_im_VisLang')
    genrePredictionDict = dict((genre, []) for genre in range(num_genres))

    labels, plotFeatures, videoFeatures, movieIds = gather_features(
        mode, return_id=True)
    _, videoFeatures = augment_labels_lstm(labels, videoFeatures, 200)
    predictionScores = model.predict([videoFeatures, plotFeatures])

    for index, scores in enumerate(predictionScores):
        movieId = movieIds[index]
        for genre in range(num_genres):
            genrePredictionDict[genre].append((scores[genre], movieId))

    # The dict is dumped unsorted; sorting is applied only for printing.
    dump_pkl(genrePredictionDict, 'genrePredictionDict_'+mode)

    for genre in range(num_genres):
        # Single-argument print(...) behaves the same as the print statement.
        print(sorted(genrePredictionDict[genre], reverse=True)[:10])
    return
def fine_tune_merge_only(merge_mode='sum', regenerate_labels=False):
    """Fine-tune a merge head on pickled 64-dimensional features.

    Builds a two-input Keras model (visual and text inputs, each of shape
    ``(64,)``), combines the inputs according to ``merge_mode``, loads
    previously saved weights and trains for 50 epochs. The training
    history dict is pickled as ``hist_ft_vislang_<merge_mode>``.

    merge_mode        -- 'sum', 'concat' and 'mul' use ``merge``;
                         'outer' uses a ``Lambda`` around
                         ``bilinear_projection``; any other value uses
                         ``BilinearTensorLayer`` with no extra layers.
    regenerate_labels -- when true, labels are re-gathered with
                         ``gather_features`` and re-pickled as
                         'trainLabels' / 'valLabels'; when false (the
                         default) those two pickles are loaded.

    Note: weights are loaded from the same relative path that the
    min-loss checkpoint writes to, before training starts.
    """
    if regenerate_labels:
        trainLabels = gather_features(mode='train', return_plot=False,
                                      return_video=False)
        valLabels = gather_features(mode='val', return_plot=False,
                                    return_video=False)
        dump_pkl(trainLabels, 'trainLabels')
        dump_pkl(valLabels, 'valLabels')
    else:
        trainLabels = load_pkl('trainLabels')
        valLabels = load_pkl('valLabels')

    train_visFeatures = load_pkl('train_visFeatures')
    train_textFeatures = load_pkl('train_textFeatures')
    val_visFeatures = load_pkl('val_visFeatures')
    val_textFeatures = load_pkl('val_textFeatures')

    visInput = Input(shape=(64, ))
    textInput = Input(shape=(64, ))

    if merge_mode in ['sum', 'concat', 'mul', 'outer']:
        # Step 1: combine the two inputs.
        if merge_mode == 'outer':
            vislangModel = Lambda(bilinear_projection,
                                  output_shape=(4096, ))([visInput, textInput])
        else:
            vislangModel = merge([visInput, textInput], mode=merge_mode)

        # Step 2: mode-specific hidden layers ('mul' has none).
        if merge_mode == 'sum':
            vislangModel = Dense(1000, activation='relu')(vislangModel)
            vislangModel = Dropout(0.5)(vislangModel)
            vislangModel = Dense(8, activation='relu')(vislangModel)
        elif merge_mode == 'concat':
            vislangModel = Dense(512, activation='relu')(vislangModel)
            vislangModel = Dropout(0.5)(vislangModel)
            vislangModel = Dense(14, activation='relu')(vislangModel)
        elif merge_mode == 'outer':
            vislangModel = Dense(16, activation='relu')(vislangModel)
            vislangModel = Dropout(0.5)(vislangModel)
            vislangModel = Dense(256, activation='relu')(vislangModel)

        # Step 3: sigmoid output layer.
        vislangModel = Dense(number_of_classes,
                             activation='sigmoid')(vislangModel)
    else:
        vislangModel = BilinearTensorLayer(input_dim=64)([visInput, textInput])

    sgd = SGD(lr=0.1, decay=0.00001, momentum=0.9, nesterov=True)
    model = Model(input=[visInput, textInput], output=[vislangModel])
    model.compile(loss='binary_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # Full model saved whenever validation accuracy improves.
    checkpoint = ModelCheckpoint(
        filepath='./data/models/ft_vislang_%s.h5' % merge_mode,
        monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # Weights-only checkpoint monitoring validation loss.
    checkpoint_loss = ModelCheckpoint(
        filepath='./data/weights/ft_weights_min_loss_%s.h5' % merge_mode,
        monitor='val_loss', save_weights_only=True, mode='min')
    # `remote` is not defined in this function; it must exist at module level.
    callbacks_list = [checkpoint, remote, checkpoint_loss]

    # Start from existing weights; this raises if the file is missing.
    model.load_weights('data/weights/ft_weights_min_loss_%s.h5' % merge_mode)

    hist = model.fit(x=[train_visFeatures, train_textFeatures],
                     y=trainLabels,
                     validation_data=([val_visFeatures, val_textFeatures],
                                      valLabels),
                     nb_epoch=50,
                     batch_size=128,
                     callbacks=callbacks_list)
    histDict = hist.history
    dump_pkl(histDict, 'hist_ft_vislang_%s' % merge_mode)
def test_vislang(mode='val'):
    """Evaluate the saved 'wiki_im_vislang' model on one split.

    mode -- split name passed to ``gather_features``.
    """
    fusion_model = load_moviescope_model('wiki_im_vislang')
    y_true, plots, frames = gather_features(mode)
    evaluate_vislang(fusion_model, frames, plots, y_true)
def test(mode='val'):
    """Evaluate the best plot model and the best video model on one split.

    Both models are loaded (names taken from ``best_plot_model`` and
    ``best_video_model``) before either evaluation runs.

    mode -- split name passed to ``gather_features``.
    """
    y_true, plots, frames = gather_features(mode)

    text_model = load_moviescope_model(best_plot_model)
    visual_model = load_moviescope_model(best_video_model)

    evaluate_text(text_model, plots, y_true)
    evaluate_visual(visual_model, frames, y_true)
def test_text():
    """Evaluate the best plot model on the 'val' split."""
    y_true, plots = gather_features(mode='val', return_video=False)
    text_model = load_moviescope_model(best_plot_model)
    evaluate_text(text_model, plots, y_true)
def test_visual():
    """Evaluate the best video model on the 'val' split."""
    y_true, frames = gather_features('val', return_plot=False)
    visual_model = load_moviescope_model(best_video_model)
    evaluate_visual(visual_model, frames, y_true)